xref: /petsc/src/vec/vec/impls/seq/dvec2.c (revision 03047865b8d8757cf1cf9cda45785c1537b01dc1)
1 /*
2    Defines some vector operation functions that are shared by
3    sequential and parallel vectors.
4 */
5 #include <../src/vec/vec/impls/dvecimpl.h>
6 #include <petsc/private/kernels/petscaxpy.h>
7 
8 #if defined(PETSC_USE_FORTRAN_KERNEL_MDOT)
9   #include <../src/vec/vec/impls/seq/ftn-kernels/fmdot.h>
VecMDot_Seq(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)10 PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
11 {
12   const PetscInt     n = xin->map->n;
13   PetscInt           i = nv, nv_rem = nv & 0x3;
14   PetscScalar        sum0 = 0., sum1 = 0., sum2 = 0., sum3;
15   const PetscScalar *yy0, *yy1, *yy2, *yy3, *x;
16   Vec               *yy = (Vec *)yin;
17 
18   PetscFunctionBegin;
19   PetscCall(VecGetArrayRead(xin, &x));
20   switch (nv_rem) {
21   case 3:
22     PetscCall(VecGetArrayRead(yy[0], &yy0));
23     PetscCall(VecGetArrayRead(yy[1], &yy1));
24     PetscCall(VecGetArrayRead(yy[2], &yy2));
25     fortranmdot3_(x, yy0, yy1, yy2, &n, &sum0, &sum1, &sum2);
26     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
27     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
28     PetscCall(VecRestoreArrayRead(yy[2], &yy2));
29     z[0] = sum0;
30     z[1] = sum1;
31     z[2] = sum2;
32     break;
33   case 2:
34     PetscCall(VecGetArrayRead(yy[0], &yy0));
35     PetscCall(VecGetArrayRead(yy[1], &yy1));
36     fortranmdot2_(x, yy0, yy1, &n, &sum0, &sum1);
37     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
38     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
39     z[0] = sum0;
40     z[1] = sum1;
41     break;
42   case 1:
43     PetscCall(VecGetArrayRead(yy[0], &yy0));
44     fortranmdot1_(x, yy0, &n, &sum0);
45     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
46     z[0] = sum0;
47     break;
48   case 0:
49     break;
50   }
51   z += nv_rem;
52   i -= nv_rem;
53   yy += nv_rem;
54 
55   while (i > 0) {
56     sum0 = 0.;
57     sum1 = 0.;
58     sum2 = 0.;
59     sum3 = 0.;
60     PetscCall(VecGetArrayRead(yy[0], &yy0));
61     PetscCall(VecGetArrayRead(yy[1], &yy1));
62     PetscCall(VecGetArrayRead(yy[2], &yy2));
63     PetscCall(VecGetArrayRead(yy[3], &yy3));
64     fortranmdot4_(x, yy0, yy1, yy2, yy3, &n, &sum0, &sum1, &sum2, &sum3);
65     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
66     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
67     PetscCall(VecRestoreArrayRead(yy[2], &yy2));
68     PetscCall(VecRestoreArrayRead(yy[3], &yy3));
69     yy += 4;
70     z[0] = sum0;
71     z[1] = sum1;
72     z[2] = sum2;
73     z[3] = sum3;
74     z += 4;
75     i -= 4;
76   }
77   PetscCall(VecRestoreArrayRead(xin, &x));
78   PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 #else
VecMDot_Seq(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)83 PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
84 {
85   const PetscInt     n = xin->map->n;
86   PetscInt           i = nv, j = n, nv_rem = nv & 0x3, j_rem;
87   PetscScalar        sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3, x0, x1, x2, x3;
88   const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
89   const Vec         *yy = (Vec *)yin;
90 
91   PetscFunctionBegin;
92   if (n == 0) {
93     PetscCall(PetscArrayzero(z, nv));
94     PetscFunctionReturn(PETSC_SUCCESS);
95   }
96   PetscCall(VecGetArrayRead(xin, &xbase));
97   x = xbase;
98   switch (nv_rem) {
99   case 3:
100     PetscCall(VecGetArrayRead(yy[0], &yy0));
101     PetscCall(VecGetArrayRead(yy[1], &yy1));
102     PetscCall(VecGetArrayRead(yy[2], &yy2));
103     switch (j_rem = j & 0x3) {
104     case 3:
105       x2 = x[2];
106       sum0 += x2 * PetscConj(yy0[2]);
107       sum1 += x2 * PetscConj(yy1[2]);
108       sum2 += x2 * PetscConj(yy2[2]); /* fall through */
109     case 2:
110       x1 = x[1];
111       sum0 += x1 * PetscConj(yy0[1]);
112       sum1 += x1 * PetscConj(yy1[1]);
113       sum2 += x1 * PetscConj(yy2[1]); /* fall through */
114     case 1:
115       x0 = x[0];
116       sum0 += x0 * PetscConj(yy0[0]);
117       sum1 += x0 * PetscConj(yy1[0]);
118       sum2 += x0 * PetscConj(yy2[0]); /* fall through */
119     case 0:
120       x += j_rem;
121       yy0 += j_rem;
122       yy1 += j_rem;
123       yy2 += j_rem;
124       j -= j_rem;
125       break;
126     }
127     while (j > 0) {
128       x0 = x[0];
129       x1 = x[1];
130       x2 = x[2];
131       x3 = x[3];
132       x += 4;
133 
134       sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
135       yy0 += 4;
136       sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
137       yy1 += 4;
138       sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
139       yy2 += 4;
140       j -= 4;
141     }
142     z[0] = sum0;
143     z[1] = sum1;
144     z[2] = sum2;
145     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
146     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
147     PetscCall(VecRestoreArrayRead(yy[2], &yy2));
148     break;
149   case 2:
150     PetscCall(VecGetArrayRead(yy[0], &yy0));
151     PetscCall(VecGetArrayRead(yy[1], &yy1));
152     switch (j_rem = j & 0x3) {
153     case 3:
154       x2 = x[2];
155       sum0 += x2 * PetscConj(yy0[2]);
156       sum1 += x2 * PetscConj(yy1[2]); /* fall through */
157     case 2:
158       x1 = x[1];
159       sum0 += x1 * PetscConj(yy0[1]);
160       sum1 += x1 * PetscConj(yy1[1]); /* fall through */
161     case 1:
162       x0 = x[0];
163       sum0 += x0 * PetscConj(yy0[0]);
164       sum1 += x0 * PetscConj(yy1[0]); /* fall through */
165     case 0:
166       x += j_rem;
167       yy0 += j_rem;
168       yy1 += j_rem;
169       j -= j_rem;
170       break;
171     }
172     while (j > 0) {
173       x0 = x[0];
174       x1 = x[1];
175       x2 = x[2];
176       x3 = x[3];
177       x += 4;
178 
179       sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
180       yy0 += 4;
181       sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
182       yy1 += 4;
183       j -= 4;
184     }
185     z[0] = sum0;
186     z[1] = sum1;
187 
188     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
189     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
190     break;
191   case 1:
192     PetscCall(VecGetArrayRead(yy[0], &yy0));
193     switch (j_rem = j & 0x3) {
194     case 3:
195       x2 = x[2];
196       sum0 += x2 * PetscConj(yy0[2]); /* fall through */
197     case 2:
198       x1 = x[1];
199       sum0 += x1 * PetscConj(yy0[1]); /* fall through */
200     case 1:
201       x0 = x[0];
202       sum0 += x0 * PetscConj(yy0[0]); /* fall through */
203     case 0:
204       x += j_rem;
205       yy0 += j_rem;
206       j -= j_rem;
207       break;
208     }
209     while (j > 0) {
210       sum0 += x[0] * PetscConj(yy0[0]) + x[1] * PetscConj(yy0[1]) + x[2] * PetscConj(yy0[2]) + x[3] * PetscConj(yy0[3]);
211       yy0 += 4;
212       j -= 4;
213       x += 4;
214     }
215     z[0] = sum0;
216 
217     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
218     break;
219   case 0:
220     break;
221   }
222   z += nv_rem;
223   i -= nv_rem;
224   yy += nv_rem;
225 
226   while (i > 0) {
227     sum0 = 0.;
228     sum1 = 0.;
229     sum2 = 0.;
230     sum3 = 0.;
231     PetscCall(VecGetArrayRead(yy[0], &yy0));
232     PetscCall(VecGetArrayRead(yy[1], &yy1));
233     PetscCall(VecGetArrayRead(yy[2], &yy2));
234     PetscCall(VecGetArrayRead(yy[3], &yy3));
235 
236     j = n;
237     x = xbase;
238     switch (j_rem = j & 0x3) {
239     case 3:
240       x2 = x[2];
241       sum0 += x2 * PetscConj(yy0[2]);
242       sum1 += x2 * PetscConj(yy1[2]);
243       sum2 += x2 * PetscConj(yy2[2]);
244       sum3 += x2 * PetscConj(yy3[2]); /* fall through */
245     case 2:
246       x1 = x[1];
247       sum0 += x1 * PetscConj(yy0[1]);
248       sum1 += x1 * PetscConj(yy1[1]);
249       sum2 += x1 * PetscConj(yy2[1]);
250       sum3 += x1 * PetscConj(yy3[1]); /* fall through */
251     case 1:
252       x0 = x[0];
253       sum0 += x0 * PetscConj(yy0[0]);
254       sum1 += x0 * PetscConj(yy1[0]);
255       sum2 += x0 * PetscConj(yy2[0]);
256       sum3 += x0 * PetscConj(yy3[0]); /* fall through */
257     case 0:
258       x += j_rem;
259       yy0 += j_rem;
260       yy1 += j_rem;
261       yy2 += j_rem;
262       yy3 += j_rem;
263       j -= j_rem;
264       break;
265     }
266     while (j > 0) {
267       x0 = x[0];
268       x1 = x[1];
269       x2 = x[2];
270       x3 = x[3];
271       x += 4;
272 
273       sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
274       yy0 += 4;
275       sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
276       yy1 += 4;
277       sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
278       yy2 += 4;
279       sum3 += x0 * PetscConj(yy3[0]) + x1 * PetscConj(yy3[1]) + x2 * PetscConj(yy3[2]) + x3 * PetscConj(yy3[3]);
280       yy3 += 4;
281       j -= 4;
282     }
283     z[0] = sum0;
284     z[1] = sum1;
285     z[2] = sum2;
286     z[3] = sum3;
287     z += 4;
288     i -= 4;
289     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
290     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
291     PetscCall(VecRestoreArrayRead(yy[2], &yy2));
292     PetscCall(VecRestoreArrayRead(yy[3], &yy3));
293     yy += 4;
294   }
295   PetscCall(VecRestoreArrayRead(xin, &xbase));
296   PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
297   PetscFunctionReturn(PETSC_SUCCESS);
298 }
299 #endif
300 
VecMTDot_Seq(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)301 PetscErrorCode VecMTDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
302 {
303   const PetscInt     n = xin->map->n;
304   PetscInt           i = nv, j = n, nv_rem = nv & 0x3, j_rem;
305   PetscScalar        sum0 = 0., sum1 = 0., sum2 = 0., sum3, x0, x1, x2, x3;
306   const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
307   const Vec         *yy = (Vec *)yin;
308 
309   PetscFunctionBegin;
310   PetscCall(VecGetArrayRead(xin, &xbase));
311   x = xbase;
312 
313   switch (nv_rem) {
314   case 3:
315     PetscCall(VecGetArrayRead(yy[0], &yy0));
316     PetscCall(VecGetArrayRead(yy[1], &yy1));
317     PetscCall(VecGetArrayRead(yy[2], &yy2));
318     switch (j_rem = j & 0x3) {
319     case 3:
320       x2 = x[2];
321       sum0 += x2 * yy0[2];
322       sum1 += x2 * yy1[2];
323       sum2 += x2 * yy2[2]; /* fall through */
324     case 2:
325       x1 = x[1];
326       sum0 += x1 * yy0[1];
327       sum1 += x1 * yy1[1];
328       sum2 += x1 * yy2[1]; /* fall through */
329     case 1:
330       x0 = x[0];
331       sum0 += x0 * yy0[0];
332       sum1 += x0 * yy1[0];
333       sum2 += x0 * yy2[0]; /* fall through */
334     case 0:
335       x += j_rem;
336       yy0 += j_rem;
337       yy1 += j_rem;
338       yy2 += j_rem;
339       j -= j_rem;
340       break;
341     }
342     while (j > 0) {
343       x0 = x[0];
344       x1 = x[1];
345       x2 = x[2];
346       x3 = x[3];
347       x += 4;
348 
349       sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
350       yy0 += 4;
351       sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
352       yy1 += 4;
353       sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
354       yy2 += 4;
355       j -= 4;
356     }
357     z[0] = sum0;
358     z[1] = sum1;
359     z[2] = sum2;
360     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
361     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
362     PetscCall(VecRestoreArrayRead(yy[2], &yy2));
363     break;
364   case 2:
365     PetscCall(VecGetArrayRead(yy[0], &yy0));
366     PetscCall(VecGetArrayRead(yy[1], &yy1));
367     switch (j_rem = j & 0x3) {
368     case 3:
369       x2 = x[2];
370       sum0 += x2 * yy0[2];
371       sum1 += x2 * yy1[2]; /* fall through */
372     case 2:
373       x1 = x[1];
374       sum0 += x1 * yy0[1];
375       sum1 += x1 * yy1[1]; /* fall through */
376     case 1:
377       x0 = x[0];
378       sum0 += x0 * yy0[0];
379       sum1 += x0 * yy1[0]; /* fall through */
380     case 0:
381       x += j_rem;
382       yy0 += j_rem;
383       yy1 += j_rem;
384       j -= j_rem;
385       break;
386     }
387     while (j > 0) {
388       x0 = x[0];
389       x1 = x[1];
390       x2 = x[2];
391       x3 = x[3];
392       x += 4;
393 
394       sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
395       yy0 += 4;
396       sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
397       yy1 += 4;
398       j -= 4;
399     }
400     z[0] = sum0;
401     z[1] = sum1;
402 
403     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
404     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
405     break;
406   case 1:
407     PetscCall(VecGetArrayRead(yy[0], &yy0));
408     switch (j_rem = j & 0x3) {
409     case 3:
410       x2 = x[2];
411       sum0 += x2 * yy0[2]; /* fall through */
412     case 2:
413       x1 = x[1];
414       sum0 += x1 * yy0[1]; /* fall through */
415     case 1:
416       x0 = x[0];
417       sum0 += x0 * yy0[0]; /* fall through */
418     case 0:
419       x += j_rem;
420       yy0 += j_rem;
421       j -= j_rem;
422       break;
423     }
424     while (j > 0) {
425       sum0 += x[0] * yy0[0] + x[1] * yy0[1] + x[2] * yy0[2] + x[3] * yy0[3];
426       yy0 += 4;
427       j -= 4;
428       x += 4;
429     }
430     z[0] = sum0;
431 
432     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
433     break;
434   case 0:
435     break;
436   }
437   z += nv_rem;
438   i -= nv_rem;
439   yy += nv_rem;
440 
441   while (i > 0) {
442     sum0 = 0.;
443     sum1 = 0.;
444     sum2 = 0.;
445     sum3 = 0.;
446     PetscCall(VecGetArrayRead(yy[0], &yy0));
447     PetscCall(VecGetArrayRead(yy[1], &yy1));
448     PetscCall(VecGetArrayRead(yy[2], &yy2));
449     PetscCall(VecGetArrayRead(yy[3], &yy3));
450     x = xbase;
451 
452     j = n;
453     switch (j_rem = j & 0x3) {
454     case 3:
455       x2 = x[2];
456       sum0 += x2 * yy0[2];
457       sum1 += x2 * yy1[2];
458       sum2 += x2 * yy2[2];
459       sum3 += x2 * yy3[2]; /* fall through */
460     case 2:
461       x1 = x[1];
462       sum0 += x1 * yy0[1];
463       sum1 += x1 * yy1[1];
464       sum2 += x1 * yy2[1];
465       sum3 += x1 * yy3[1]; /* fall through */
466     case 1:
467       x0 = x[0];
468       sum0 += x0 * yy0[0];
469       sum1 += x0 * yy1[0];
470       sum2 += x0 * yy2[0];
471       sum3 += x0 * yy3[0]; /* fall through */
472     case 0:
473       x += j_rem;
474       yy0 += j_rem;
475       yy1 += j_rem;
476       yy2 += j_rem;
477       yy3 += j_rem;
478       j -= j_rem;
479       break;
480     }
481     while (j > 0) {
482       x0 = x[0];
483       x1 = x[1];
484       x2 = x[2];
485       x3 = x[3];
486       x += 4;
487 
488       sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
489       yy0 += 4;
490       sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
491       yy1 += 4;
492       sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
493       yy2 += 4;
494       sum3 += x0 * yy3[0] + x1 * yy3[1] + x2 * yy3[2] + x3 * yy3[3];
495       yy3 += 4;
496       j -= 4;
497     }
498     z[0] = sum0;
499     z[1] = sum1;
500     z[2] = sum2;
501     z[3] = sum3;
502     z += 4;
503     i -= 4;
504     PetscCall(VecRestoreArrayRead(yy[0], &yy0));
505     PetscCall(VecRestoreArrayRead(yy[1], &yy1));
506     PetscCall(VecRestoreArrayRead(yy[2], &yy2));
507     PetscCall(VecRestoreArrayRead(yy[3], &yy3));
508     yy += 4;
509   }
510   PetscCall(VecRestoreArrayRead(xin, &xbase));
511   PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
512   PetscFunctionReturn(PETSC_SUCCESS);
513 }
514 
VecMultiDot_Seq_GEMV(PetscBool conjugate,Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)515 static PetscErrorCode VecMultiDot_Seq_GEMV(PetscBool conjugate, Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
516 {
517   PetscInt           i, j, nfail;
518   const PetscScalar *xarray, *yarray, *yfirst, *ynext;
519   PetscBool          stop  = PETSC_FALSE;
520   const char        *trans = conjugate ? "C" : "T";
521   PetscInt64         lda   = 0;
522   PetscBLASInt       n, m;
523 
524   PetscFunctionBegin;
525   if (xin->map->n == 0) { // otherwise BLAS complains illegal values (0) for lda
526     PetscCall(PetscArrayzero(z, nv));
527     PetscFunctionReturn(PETSC_SUCCESS);
528   }
529 
530   // caller guarantees xin->map->n <= PETSC_BLAS_INT_MAX
531   PetscCall(PetscBLASIntCast(xin->map->n, &n));
532   PetscCall(VecGetArrayRead(xin, &xarray));
533   i = nfail = 0;
534   while (i < nv) {
535     stop = PETSC_FALSE;
536     PetscCall(VecGetArrayRead(yin[i], &yfirst));
537     yarray = yfirst;
538     for (j = i + 1; j < nv; j++) {
539       PetscCall(VecGetArrayRead(yin[j], &ynext));
540       if (j == i + 1) {
541         lda = ynext - yarray;
542         if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
543       } else if (lda * (j - i) != ynext - yarray) { // not in the same stride? if so, stop here
544         stop = PETSC_TRUE;
545       }
546       PetscCall(VecRestoreArrayRead(yin[j], &ynext));
547       if (stop) break;
548     }
549     PetscCall(VecRestoreArrayRead(yin[i], &yfirst));
550 
551     // we found m vectors yin[i..j)
552     PetscCall(PetscBLASIntCast(j - i, &m));
553     if (m > 1) {
554       PetscBLASInt ione = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
555       PetscScalar  one = 1, zero = 0;
556 
557       PetscCallBLAS("BLASgemv", BLASgemv_(trans, &n, &m, &one, yarray, &lda2, xarray, &ione, &zero, z + i, &ione));
558       PetscCall(PetscLogFlops(PetscMax(m * (2.0 * n - 1), 0.0)));
559     } else {
560       if (nfail == 0) {
561         if (conjugate) PetscCall(VecDot_Seq(xin, yin[i], z + i));
562         else PetscCall(VecTDot_Seq(xin, yin[i], z + i));
563         nfail++;
564       } else break;
565     }
566     i = j;
567   }
568   PetscCall(VecRestoreArrayRead(xin, &xarray));
569   if (i < nv) { // finish the remaining if any
570     if (conjugate) PetscCall(VecMDot_Seq(xin, nv - i, yin + i, z + i));
571     else PetscCall(VecMTDot_Seq(xin, nv - i, yin + i, z + i));
572   }
573   PetscFunctionReturn(PETSC_SUCCESS);
574 }
575 
VecMDot_Seq_GEMV(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)576 PetscErrorCode VecMDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
577 {
578   PetscFunctionBegin;
579   if (xin->map->n > PETSC_BLAS_INT_MAX) {
580     PetscCall(VecMDot_Seq(xin, nv, yin, z));
581   } else {
582     PetscCall(VecMultiDot_Seq_GEMV(PETSC_TRUE, xin, nv, yin, z));
583   }
584   PetscFunctionReturn(PETSC_SUCCESS);
585 }
586 
VecMTDot_Seq_GEMV(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)587 PetscErrorCode VecMTDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
588 {
589   PetscFunctionBegin;
590   if (xin->map->n > PETSC_BLAS_INT_MAX) {
591     PetscCall(VecMTDot_Seq(xin, nv, yin, z));
592   } else {
593     PetscCall(VecMultiDot_Seq_GEMV(PETSC_FALSE, xin, nv, yin, z));
594   }
595   PetscFunctionReturn(PETSC_SUCCESS);
596 }
597 
VecMinMax_Seq(Vec xin,PetscInt * idx,PetscReal * z,PetscReal minmax,int (* const cmp)(PetscReal,PetscReal))598 static PetscErrorCode VecMinMax_Seq(Vec xin, PetscInt *idx, PetscReal *z, PetscReal minmax, int (*const cmp)(PetscReal, PetscReal))
599 {
600   const PetscInt n = xin->map->n;
601   PetscInt       j = -1;
602 
603   PetscFunctionBegin;
604   if (n) {
605     const PetscScalar *xx;
606 
607     PetscCall(VecGetArrayRead(xin, &xx));
608     minmax = PetscRealPart(xx[(j = 0)]);
609     for (PetscInt i = 1; i < n; ++i) {
610       const PetscReal tmp = PetscRealPart(xx[i]);
611 
612       if (cmp(tmp, minmax)) {
613         j      = i;
614         minmax = tmp;
615       }
616     }
617     PetscCall(VecRestoreArrayRead(xin, &xx));
618   }
619   *z = minmax;
620   if (idx) *idx = j;
621   PetscFunctionReturn(PETSC_SUCCESS);
622 }
623 
VecMax_Seq_GT(PetscReal l,PetscReal r)624 static int VecMax_Seq_GT(PetscReal l, PetscReal r)
625 {
626   return l > r;
627 }
628 
VecMax_Seq(Vec xin,PetscInt * idx,PetscReal * z)629 PetscErrorCode VecMax_Seq(Vec xin, PetscInt *idx, PetscReal *z)
630 {
631   PetscFunctionBegin;
632   PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MIN_REAL, VecMax_Seq_GT));
633   PetscFunctionReturn(PETSC_SUCCESS);
634 }
635 
VecMin_Seq_LT(PetscReal l,PetscReal r)636 static int VecMin_Seq_LT(PetscReal l, PetscReal r)
637 {
638   return l < r;
639 }
640 
VecMin_Seq(Vec xin,PetscInt * idx,PetscReal * z)641 PetscErrorCode VecMin_Seq(Vec xin, PetscInt *idx, PetscReal *z)
642 {
643   PetscFunctionBegin;
644   PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MAX_REAL, VecMin_Seq_LT));
645   PetscFunctionReturn(PETSC_SUCCESS);
646 }
647 
VecSet_Seq(Vec xin,PetscScalar alpha)648 PetscErrorCode VecSet_Seq(Vec xin, PetscScalar alpha)
649 {
650   const PetscInt n = xin->map->n;
651   PetscScalar   *xx;
652 
653   PetscFunctionBegin;
654   PetscCall(VecGetArrayWrite(xin, &xx));
655   if (alpha == (PetscScalar)0.0) {
656     PetscCall(PetscArrayzero(xx, n));
657   } else {
658     for (PetscInt i = 0; i < n; i++) xx[i] = alpha;
659   }
660   PetscCall(VecRestoreArrayWrite(xin, &xx));
661   PetscFunctionReturn(PETSC_SUCCESS);
662 }
663 
VecMAXPY_Seq(Vec xin,PetscInt nv,const PetscScalar * alpha,Vec * y)664 PetscErrorCode VecMAXPY_Seq(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
665 {
666   const PetscInt     j_rem = nv & 0x3, n = xin->map->n;
667   const PetscScalar *yptr[4];
668   PetscScalar       *xx;
669 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
670   #pragma disjoint(*xx, **yptr, *aptr)
671 #endif
672 
673   PetscFunctionBegin;
674   PetscCall(PetscLogFlops(nv * 2.0 * n));
675   PetscCall(VecGetArray(xin, &xx));
676   for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
677   switch (j_rem) {
678   case 3:
679     PetscKernelAXPY3(xx, alpha[0], alpha[1], alpha[2], yptr[0], yptr[1], yptr[2], n);
680     break;
681   case 2:
682     PetscKernelAXPY2(xx, alpha[0], alpha[1], yptr[0], yptr[1], n);
683     break;
684   case 1:
685     PetscKernelAXPY(xx, alpha[0], yptr[0], n);
686   default:
687     break;
688   }
689   for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
690   alpha += j_rem;
691   y += j_rem;
692   for (PetscInt j = j_rem, inc = 4; j < nv; j += inc, alpha += inc, y += inc) {
693     for (PetscInt i = 0; i < inc; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
694     PetscKernelAXPY4(xx, alpha[0], alpha[1], alpha[2], alpha[3], yptr[0], yptr[1], yptr[2], yptr[3], n);
695     for (PetscInt i = 0; i < inc; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
696   }
697   PetscCall(VecRestoreArray(xin, &xx));
698   PetscFunctionReturn(PETSC_SUCCESS);
699 }
700 
701 /*  y = y + sum alpha[i] x[i] */
VecMAXPY_Seq_GEMV(Vec yin,PetscInt nv,const PetscScalar alpha[],Vec xin[])702 PetscErrorCode VecMAXPY_Seq_GEMV(Vec yin, PetscInt nv, const PetscScalar alpha[], Vec xin[])
703 {
704   PetscInt           i, j, nfail;
705   PetscScalar       *yarray;
706   const PetscScalar *xfirst, *xnext, *xarray;
707   PetscBool          stop = PETSC_FALSE;
708   PetscInt64         lda  = 0;
709   PetscBLASInt       n, m;
710 
711   PetscFunctionBegin;
712   if (yin->map->n == 0 || yin->map->n > PETSC_BLAS_INT_MAX) {
713     PetscCall(VecMAXPY_Seq(yin, nv, alpha, xin));
714     PetscFunctionReturn(PETSC_SUCCESS);
715   }
716 
717   PetscCall(PetscBLASIntCast(yin->map->n, &n));
718   PetscCall(VecGetArray(yin, &yarray));
719   i = nfail = 0;
720   while (i < nv) {
721     stop = PETSC_FALSE;
722     PetscCall(VecGetArrayRead(xin[i], &xfirst));
723     xarray = xfirst;
724     for (j = i + 1; j < nv; j++) {
725       PetscCall(VecGetArrayRead(xin[j], &xnext));
726       if (j == i + 1) {
727         lda = xnext - xarray;
728         if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
729       } else if (lda * (j - i) != xnext - xarray) { // not in the same stride? if so, stop here
730         stop = PETSC_TRUE;
731       }
732       PetscCall(VecRestoreArrayRead(xin[j], &xnext));
733       if (stop) break;
734     }
735     PetscCall(VecRestoreArrayRead(xin[i], &xfirst));
736 
737     PetscCall(PetscBLASIntCast(j - i, &m));
738     if (m > 1) {
739       PetscBLASInt incx = 1, incy = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
740       PetscScalar  one = 1;
741       PetscCallBLAS("BLASgemv", BLASgemv_("N", &n, &m, &one, xarray, &lda2, alpha + i, &incx, &one, yarray, &incy));
742       PetscCall(PetscLogFlops(m * 2.0 * n));
743     } else {
744       // we only allow falling back on VecAXPY once
745       if (nfail++ == 0) PetscCall(VecAXPY_Seq(yin, alpha[i], xin[i]));
746       else break; // break the while loop
747     }
748     i = j;
749   }
750   PetscCall(VecRestoreArray(yin, &yarray));
751 
752   // finish the remaining if any
753   if (i < nv) PetscCall(VecMAXPY_Seq(yin, nv - i, alpha + i, xin + i));
754   PetscFunctionReturn(PETSC_SUCCESS);
755 }
756 
757 #include <../src/vec/vec/impls/seq/ftn-kernels/faypx.h>
758 
VecAYPX_Seq(Vec yin,PetscScalar alpha,Vec xin)759 PetscErrorCode VecAYPX_Seq(Vec yin, PetscScalar alpha, Vec xin)
760 {
761   PetscFunctionBegin;
762   if (alpha == (PetscScalar)0.0) {
763     PetscCall(VecCopy(xin, yin));
764   } else if (alpha == (PetscScalar)1.0) {
765     PetscCall(VecAXPY_Seq(yin, alpha, xin));
766   } else {
767     const PetscInt     n = yin->map->n;
768     const PetscScalar *xx;
769     PetscScalar       *yy;
770 
771     PetscCall(VecGetArrayRead(xin, &xx));
772     PetscCall(VecGetArray(yin, &yy));
773     if (alpha == (PetscScalar)-1.0) {
774       for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] - yy[i];
775       PetscCall(PetscLogFlops(n));
776     } else {
777 #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
778       fortranaypx_(&n, &alpha, xx, yy);
779 #else
780       for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] + alpha * yy[i];
781 #endif
782       PetscCall(PetscLogFlops(2 * n));
783     }
784     PetscCall(VecRestoreArrayRead(xin, &xx));
785     PetscCall(VecRestoreArray(yin, &yy));
786   }
787   PetscFunctionReturn(PETSC_SUCCESS);
788 }
789 
790 #include <../src/vec/vec/impls/seq/ftn-kernels/fwaxpy.h>
791 /*
792    IBM ESSL contains a routine dzaxpy() that is our WAXPY() but it appears
793    to be slower than a regular C loop.  Hence,we do not include it.
794    void ?zaxpy(int*,PetscScalar*,PetscScalar*,int*,PetscScalar*,int*,PetscScalar*,int*);
795 */
796 
VecWAXPY_Seq(Vec win,PetscScalar alpha,Vec xin,Vec yin)797 PetscErrorCode VecWAXPY_Seq(Vec win, PetscScalar alpha, Vec xin, Vec yin)
798 {
799   const PetscInt     n = win->map->n;
800   const PetscScalar *yy, *xx;
801   PetscScalar       *ww;
802 
803   PetscFunctionBegin;
804   PetscCall(VecGetArrayRead(xin, &xx));
805   PetscCall(VecGetArrayRead(yin, &yy));
806   PetscCall(VecGetArray(win, &ww));
807   if (alpha == (PetscScalar)1.0) {
808     PetscCall(PetscLogFlops(n));
809     /* could call BLAS axpy after call to memcopy, but may be slower */
810     for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + xx[i];
811   } else if (alpha == (PetscScalar)-1.0) {
812     PetscCall(PetscLogFlops(n));
813     for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] - xx[i];
814   } else if (alpha == (PetscScalar)0.0) {
815     PetscCall(PetscArraycpy(ww, yy, n));
816   } else {
817     PetscCall(PetscLogFlops(2.0 * n));
818 #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
819     fortranwaxpy_(&n, &alpha, xx, yy, ww);
820 #else
821     for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + alpha * xx[i];
822 #endif
823   }
824   PetscCall(VecRestoreArrayRead(xin, &xx));
825   PetscCall(VecRestoreArrayRead(yin, &yy));
826   PetscCall(VecRestoreArray(win, &ww));
827   PetscFunctionReturn(PETSC_SUCCESS);
828 }
829 
VecMaxPointwiseDivide_Seq(Vec xin,Vec yin,PetscReal * max)830 PetscErrorCode VecMaxPointwiseDivide_Seq(Vec xin, Vec yin, PetscReal *max)
831 {
832   const PetscInt     n = xin->map->n;
833   const PetscScalar *xx, *yy;
834   PetscReal          m = 0.0;
835 
836   PetscFunctionBegin;
837   PetscCall(VecGetArrayRead(xin, &xx));
838   PetscCall(VecGetArrayRead(yin, &yy));
839   for (PetscInt i = 0; i < n; ++i) {
840     const PetscReal v = PetscAbsScalar(yy[i] == (PetscScalar)0.0 ? xx[i] : xx[i] / yy[i]);
841 
842     // use a separate value to not re-evaluate side-effects
843     m = PetscMax(v, m);
844   }
845   PetscCall(VecRestoreArrayRead(xin, &xx));
846   PetscCall(VecRestoreArrayRead(yin, &yy));
847   PetscCall(PetscLogFlops(n));
848   *max = m;
849   PetscFunctionReturn(PETSC_SUCCESS);
850 }
851 
VecPlaceArray_Seq(Vec vin,const PetscScalar * a)852 PetscErrorCode VecPlaceArray_Seq(Vec vin, const PetscScalar *a)
853 {
854   Vec_Seq *v = (Vec_Seq *)vin->data;
855 
856   PetscFunctionBegin;
857   PetscCheck(!v->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
858   v->unplacedarray = v->array; /* save previous array so reset can bring it back */
859   v->array         = (PetscScalar *)a;
860   PetscFunctionReturn(PETSC_SUCCESS);
861 }
862 
VecReplaceArray_Seq(Vec vin,const PetscScalar * a)863 PetscErrorCode VecReplaceArray_Seq(Vec vin, const PetscScalar *a)
864 {
865   Vec_Seq *v = (Vec_Seq *)vin->data;
866 
867   PetscFunctionBegin;
868   PetscCall(PetscFree(v->array_allocated));
869   v->array_allocated = v->array = (PetscScalar *)a;
870   PetscFunctionReturn(PETSC_SUCCESS);
871 }
872