1 /*
2 Defines some vector operation functions that are shared by
3 sequential and parallel vectors.
4 */
5 #include <../src/vec/vec/impls/dvecimpl.h>
6 #include <petsc/private/kernels/petscaxpy.h>
7
8 #if defined(PETSC_USE_FORTRAN_KERNEL_MDOT)
9 #include <../src/vec/vec/impls/seq/ftn-kernels/fmdot.h>
VecMDot_Seq(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)10 PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
11 {
12 const PetscInt n = xin->map->n;
13 PetscInt i = nv, nv_rem = nv & 0x3;
14 PetscScalar sum0 = 0., sum1 = 0., sum2 = 0., sum3;
15 const PetscScalar *yy0, *yy1, *yy2, *yy3, *x;
16 Vec *yy = (Vec *)yin;
17
18 PetscFunctionBegin;
19 PetscCall(VecGetArrayRead(xin, &x));
20 switch (nv_rem) {
21 case 3:
22 PetscCall(VecGetArrayRead(yy[0], &yy0));
23 PetscCall(VecGetArrayRead(yy[1], &yy1));
24 PetscCall(VecGetArrayRead(yy[2], &yy2));
25 fortranmdot3_(x, yy0, yy1, yy2, &n, &sum0, &sum1, &sum2);
26 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
27 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
28 PetscCall(VecRestoreArrayRead(yy[2], &yy2));
29 z[0] = sum0;
30 z[1] = sum1;
31 z[2] = sum2;
32 break;
33 case 2:
34 PetscCall(VecGetArrayRead(yy[0], &yy0));
35 PetscCall(VecGetArrayRead(yy[1], &yy1));
36 fortranmdot2_(x, yy0, yy1, &n, &sum0, &sum1);
37 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
38 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
39 z[0] = sum0;
40 z[1] = sum1;
41 break;
42 case 1:
43 PetscCall(VecGetArrayRead(yy[0], &yy0));
44 fortranmdot1_(x, yy0, &n, &sum0);
45 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
46 z[0] = sum0;
47 break;
48 case 0:
49 break;
50 }
51 z += nv_rem;
52 i -= nv_rem;
53 yy += nv_rem;
54
55 while (i > 0) {
56 sum0 = 0.;
57 sum1 = 0.;
58 sum2 = 0.;
59 sum3 = 0.;
60 PetscCall(VecGetArrayRead(yy[0], &yy0));
61 PetscCall(VecGetArrayRead(yy[1], &yy1));
62 PetscCall(VecGetArrayRead(yy[2], &yy2));
63 PetscCall(VecGetArrayRead(yy[3], &yy3));
64 fortranmdot4_(x, yy0, yy1, yy2, yy3, &n, &sum0, &sum1, &sum2, &sum3);
65 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
66 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
67 PetscCall(VecRestoreArrayRead(yy[2], &yy2));
68 PetscCall(VecRestoreArrayRead(yy[3], &yy3));
69 yy += 4;
70 z[0] = sum0;
71 z[1] = sum1;
72 z[2] = sum2;
73 z[3] = sum3;
74 z += 4;
75 i -= 4;
76 }
77 PetscCall(VecRestoreArrayRead(xin, &x));
78 PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
79 PetscFunctionReturn(PETSC_SUCCESS);
80 }
81
82 #else
VecMDot_Seq(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)83 PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
84 {
85 const PetscInt n = xin->map->n;
86 PetscInt i = nv, j = n, nv_rem = nv & 0x3, j_rem;
87 PetscScalar sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3, x0, x1, x2, x3;
88 const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
89 const Vec *yy = (Vec *)yin;
90
91 PetscFunctionBegin;
92 if (n == 0) {
93 PetscCall(PetscArrayzero(z, nv));
94 PetscFunctionReturn(PETSC_SUCCESS);
95 }
96 PetscCall(VecGetArrayRead(xin, &xbase));
97 x = xbase;
98 switch (nv_rem) {
99 case 3:
100 PetscCall(VecGetArrayRead(yy[0], &yy0));
101 PetscCall(VecGetArrayRead(yy[1], &yy1));
102 PetscCall(VecGetArrayRead(yy[2], &yy2));
103 switch (j_rem = j & 0x3) {
104 case 3:
105 x2 = x[2];
106 sum0 += x2 * PetscConj(yy0[2]);
107 sum1 += x2 * PetscConj(yy1[2]);
108 sum2 += x2 * PetscConj(yy2[2]); /* fall through */
109 case 2:
110 x1 = x[1];
111 sum0 += x1 * PetscConj(yy0[1]);
112 sum1 += x1 * PetscConj(yy1[1]);
113 sum2 += x1 * PetscConj(yy2[1]); /* fall through */
114 case 1:
115 x0 = x[0];
116 sum0 += x0 * PetscConj(yy0[0]);
117 sum1 += x0 * PetscConj(yy1[0]);
118 sum2 += x0 * PetscConj(yy2[0]); /* fall through */
119 case 0:
120 x += j_rem;
121 yy0 += j_rem;
122 yy1 += j_rem;
123 yy2 += j_rem;
124 j -= j_rem;
125 break;
126 }
127 while (j > 0) {
128 x0 = x[0];
129 x1 = x[1];
130 x2 = x[2];
131 x3 = x[3];
132 x += 4;
133
134 sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
135 yy0 += 4;
136 sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
137 yy1 += 4;
138 sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
139 yy2 += 4;
140 j -= 4;
141 }
142 z[0] = sum0;
143 z[1] = sum1;
144 z[2] = sum2;
145 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
146 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
147 PetscCall(VecRestoreArrayRead(yy[2], &yy2));
148 break;
149 case 2:
150 PetscCall(VecGetArrayRead(yy[0], &yy0));
151 PetscCall(VecGetArrayRead(yy[1], &yy1));
152 switch (j_rem = j & 0x3) {
153 case 3:
154 x2 = x[2];
155 sum0 += x2 * PetscConj(yy0[2]);
156 sum1 += x2 * PetscConj(yy1[2]); /* fall through */
157 case 2:
158 x1 = x[1];
159 sum0 += x1 * PetscConj(yy0[1]);
160 sum1 += x1 * PetscConj(yy1[1]); /* fall through */
161 case 1:
162 x0 = x[0];
163 sum0 += x0 * PetscConj(yy0[0]);
164 sum1 += x0 * PetscConj(yy1[0]); /* fall through */
165 case 0:
166 x += j_rem;
167 yy0 += j_rem;
168 yy1 += j_rem;
169 j -= j_rem;
170 break;
171 }
172 while (j > 0) {
173 x0 = x[0];
174 x1 = x[1];
175 x2 = x[2];
176 x3 = x[3];
177 x += 4;
178
179 sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
180 yy0 += 4;
181 sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
182 yy1 += 4;
183 j -= 4;
184 }
185 z[0] = sum0;
186 z[1] = sum1;
187
188 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
189 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
190 break;
191 case 1:
192 PetscCall(VecGetArrayRead(yy[0], &yy0));
193 switch (j_rem = j & 0x3) {
194 case 3:
195 x2 = x[2];
196 sum0 += x2 * PetscConj(yy0[2]); /* fall through */
197 case 2:
198 x1 = x[1];
199 sum0 += x1 * PetscConj(yy0[1]); /* fall through */
200 case 1:
201 x0 = x[0];
202 sum0 += x0 * PetscConj(yy0[0]); /* fall through */
203 case 0:
204 x += j_rem;
205 yy0 += j_rem;
206 j -= j_rem;
207 break;
208 }
209 while (j > 0) {
210 sum0 += x[0] * PetscConj(yy0[0]) + x[1] * PetscConj(yy0[1]) + x[2] * PetscConj(yy0[2]) + x[3] * PetscConj(yy0[3]);
211 yy0 += 4;
212 j -= 4;
213 x += 4;
214 }
215 z[0] = sum0;
216
217 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
218 break;
219 case 0:
220 break;
221 }
222 z += nv_rem;
223 i -= nv_rem;
224 yy += nv_rem;
225
226 while (i > 0) {
227 sum0 = 0.;
228 sum1 = 0.;
229 sum2 = 0.;
230 sum3 = 0.;
231 PetscCall(VecGetArrayRead(yy[0], &yy0));
232 PetscCall(VecGetArrayRead(yy[1], &yy1));
233 PetscCall(VecGetArrayRead(yy[2], &yy2));
234 PetscCall(VecGetArrayRead(yy[3], &yy3));
235
236 j = n;
237 x = xbase;
238 switch (j_rem = j & 0x3) {
239 case 3:
240 x2 = x[2];
241 sum0 += x2 * PetscConj(yy0[2]);
242 sum1 += x2 * PetscConj(yy1[2]);
243 sum2 += x2 * PetscConj(yy2[2]);
244 sum3 += x2 * PetscConj(yy3[2]); /* fall through */
245 case 2:
246 x1 = x[1];
247 sum0 += x1 * PetscConj(yy0[1]);
248 sum1 += x1 * PetscConj(yy1[1]);
249 sum2 += x1 * PetscConj(yy2[1]);
250 sum3 += x1 * PetscConj(yy3[1]); /* fall through */
251 case 1:
252 x0 = x[0];
253 sum0 += x0 * PetscConj(yy0[0]);
254 sum1 += x0 * PetscConj(yy1[0]);
255 sum2 += x0 * PetscConj(yy2[0]);
256 sum3 += x0 * PetscConj(yy3[0]); /* fall through */
257 case 0:
258 x += j_rem;
259 yy0 += j_rem;
260 yy1 += j_rem;
261 yy2 += j_rem;
262 yy3 += j_rem;
263 j -= j_rem;
264 break;
265 }
266 while (j > 0) {
267 x0 = x[0];
268 x1 = x[1];
269 x2 = x[2];
270 x3 = x[3];
271 x += 4;
272
273 sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
274 yy0 += 4;
275 sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
276 yy1 += 4;
277 sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
278 yy2 += 4;
279 sum3 += x0 * PetscConj(yy3[0]) + x1 * PetscConj(yy3[1]) + x2 * PetscConj(yy3[2]) + x3 * PetscConj(yy3[3]);
280 yy3 += 4;
281 j -= 4;
282 }
283 z[0] = sum0;
284 z[1] = sum1;
285 z[2] = sum2;
286 z[3] = sum3;
287 z += 4;
288 i -= 4;
289 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
290 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
291 PetscCall(VecRestoreArrayRead(yy[2], &yy2));
292 PetscCall(VecRestoreArrayRead(yy[3], &yy3));
293 yy += 4;
294 }
295 PetscCall(VecRestoreArrayRead(xin, &xbase));
296 PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
297 PetscFunctionReturn(PETSC_SUCCESS);
298 }
299 #endif
300
VecMTDot_Seq(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)301 PetscErrorCode VecMTDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
302 {
303 const PetscInt n = xin->map->n;
304 PetscInt i = nv, j = n, nv_rem = nv & 0x3, j_rem;
305 PetscScalar sum0 = 0., sum1 = 0., sum2 = 0., sum3, x0, x1, x2, x3;
306 const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
307 const Vec *yy = (Vec *)yin;
308
309 PetscFunctionBegin;
310 PetscCall(VecGetArrayRead(xin, &xbase));
311 x = xbase;
312
313 switch (nv_rem) {
314 case 3:
315 PetscCall(VecGetArrayRead(yy[0], &yy0));
316 PetscCall(VecGetArrayRead(yy[1], &yy1));
317 PetscCall(VecGetArrayRead(yy[2], &yy2));
318 switch (j_rem = j & 0x3) {
319 case 3:
320 x2 = x[2];
321 sum0 += x2 * yy0[2];
322 sum1 += x2 * yy1[2];
323 sum2 += x2 * yy2[2]; /* fall through */
324 case 2:
325 x1 = x[1];
326 sum0 += x1 * yy0[1];
327 sum1 += x1 * yy1[1];
328 sum2 += x1 * yy2[1]; /* fall through */
329 case 1:
330 x0 = x[0];
331 sum0 += x0 * yy0[0];
332 sum1 += x0 * yy1[0];
333 sum2 += x0 * yy2[0]; /* fall through */
334 case 0:
335 x += j_rem;
336 yy0 += j_rem;
337 yy1 += j_rem;
338 yy2 += j_rem;
339 j -= j_rem;
340 break;
341 }
342 while (j > 0) {
343 x0 = x[0];
344 x1 = x[1];
345 x2 = x[2];
346 x3 = x[3];
347 x += 4;
348
349 sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
350 yy0 += 4;
351 sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
352 yy1 += 4;
353 sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
354 yy2 += 4;
355 j -= 4;
356 }
357 z[0] = sum0;
358 z[1] = sum1;
359 z[2] = sum2;
360 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
361 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
362 PetscCall(VecRestoreArrayRead(yy[2], &yy2));
363 break;
364 case 2:
365 PetscCall(VecGetArrayRead(yy[0], &yy0));
366 PetscCall(VecGetArrayRead(yy[1], &yy1));
367 switch (j_rem = j & 0x3) {
368 case 3:
369 x2 = x[2];
370 sum0 += x2 * yy0[2];
371 sum1 += x2 * yy1[2]; /* fall through */
372 case 2:
373 x1 = x[1];
374 sum0 += x1 * yy0[1];
375 sum1 += x1 * yy1[1]; /* fall through */
376 case 1:
377 x0 = x[0];
378 sum0 += x0 * yy0[0];
379 sum1 += x0 * yy1[0]; /* fall through */
380 case 0:
381 x += j_rem;
382 yy0 += j_rem;
383 yy1 += j_rem;
384 j -= j_rem;
385 break;
386 }
387 while (j > 0) {
388 x0 = x[0];
389 x1 = x[1];
390 x2 = x[2];
391 x3 = x[3];
392 x += 4;
393
394 sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
395 yy0 += 4;
396 sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
397 yy1 += 4;
398 j -= 4;
399 }
400 z[0] = sum0;
401 z[1] = sum1;
402
403 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
404 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
405 break;
406 case 1:
407 PetscCall(VecGetArrayRead(yy[0], &yy0));
408 switch (j_rem = j & 0x3) {
409 case 3:
410 x2 = x[2];
411 sum0 += x2 * yy0[2]; /* fall through */
412 case 2:
413 x1 = x[1];
414 sum0 += x1 * yy0[1]; /* fall through */
415 case 1:
416 x0 = x[0];
417 sum0 += x0 * yy0[0]; /* fall through */
418 case 0:
419 x += j_rem;
420 yy0 += j_rem;
421 j -= j_rem;
422 break;
423 }
424 while (j > 0) {
425 sum0 += x[0] * yy0[0] + x[1] * yy0[1] + x[2] * yy0[2] + x[3] * yy0[3];
426 yy0 += 4;
427 j -= 4;
428 x += 4;
429 }
430 z[0] = sum0;
431
432 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
433 break;
434 case 0:
435 break;
436 }
437 z += nv_rem;
438 i -= nv_rem;
439 yy += nv_rem;
440
441 while (i > 0) {
442 sum0 = 0.;
443 sum1 = 0.;
444 sum2 = 0.;
445 sum3 = 0.;
446 PetscCall(VecGetArrayRead(yy[0], &yy0));
447 PetscCall(VecGetArrayRead(yy[1], &yy1));
448 PetscCall(VecGetArrayRead(yy[2], &yy2));
449 PetscCall(VecGetArrayRead(yy[3], &yy3));
450 x = xbase;
451
452 j = n;
453 switch (j_rem = j & 0x3) {
454 case 3:
455 x2 = x[2];
456 sum0 += x2 * yy0[2];
457 sum1 += x2 * yy1[2];
458 sum2 += x2 * yy2[2];
459 sum3 += x2 * yy3[2]; /* fall through */
460 case 2:
461 x1 = x[1];
462 sum0 += x1 * yy0[1];
463 sum1 += x1 * yy1[1];
464 sum2 += x1 * yy2[1];
465 sum3 += x1 * yy3[1]; /* fall through */
466 case 1:
467 x0 = x[0];
468 sum0 += x0 * yy0[0];
469 sum1 += x0 * yy1[0];
470 sum2 += x0 * yy2[0];
471 sum3 += x0 * yy3[0]; /* fall through */
472 case 0:
473 x += j_rem;
474 yy0 += j_rem;
475 yy1 += j_rem;
476 yy2 += j_rem;
477 yy3 += j_rem;
478 j -= j_rem;
479 break;
480 }
481 while (j > 0) {
482 x0 = x[0];
483 x1 = x[1];
484 x2 = x[2];
485 x3 = x[3];
486 x += 4;
487
488 sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
489 yy0 += 4;
490 sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
491 yy1 += 4;
492 sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
493 yy2 += 4;
494 sum3 += x0 * yy3[0] + x1 * yy3[1] + x2 * yy3[2] + x3 * yy3[3];
495 yy3 += 4;
496 j -= 4;
497 }
498 z[0] = sum0;
499 z[1] = sum1;
500 z[2] = sum2;
501 z[3] = sum3;
502 z += 4;
503 i -= 4;
504 PetscCall(VecRestoreArrayRead(yy[0], &yy0));
505 PetscCall(VecRestoreArrayRead(yy[1], &yy1));
506 PetscCall(VecRestoreArrayRead(yy[2], &yy2));
507 PetscCall(VecRestoreArrayRead(yy[3], &yy3));
508 yy += 4;
509 }
510 PetscCall(VecRestoreArrayRead(xin, &xbase));
511 PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
512 PetscFunctionReturn(PETSC_SUCCESS);
513 }
514
VecMultiDot_Seq_GEMV(PetscBool conjugate,Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)515 static PetscErrorCode VecMultiDot_Seq_GEMV(PetscBool conjugate, Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
516 {
517 PetscInt i, j, nfail;
518 const PetscScalar *xarray, *yarray, *yfirst, *ynext;
519 PetscBool stop = PETSC_FALSE;
520 const char *trans = conjugate ? "C" : "T";
521 PetscInt64 lda = 0;
522 PetscBLASInt n, m;
523
524 PetscFunctionBegin;
525 if (xin->map->n == 0) { // otherwise BLAS complains illegal values (0) for lda
526 PetscCall(PetscArrayzero(z, nv));
527 PetscFunctionReturn(PETSC_SUCCESS);
528 }
529
530 // caller guarantees xin->map->n <= PETSC_BLAS_INT_MAX
531 PetscCall(PetscBLASIntCast(xin->map->n, &n));
532 PetscCall(VecGetArrayRead(xin, &xarray));
533 i = nfail = 0;
534 while (i < nv) {
535 stop = PETSC_FALSE;
536 PetscCall(VecGetArrayRead(yin[i], &yfirst));
537 yarray = yfirst;
538 for (j = i + 1; j < nv; j++) {
539 PetscCall(VecGetArrayRead(yin[j], &ynext));
540 if (j == i + 1) {
541 lda = ynext - yarray;
542 if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
543 } else if (lda * (j - i) != ynext - yarray) { // not in the same stride? if so, stop here
544 stop = PETSC_TRUE;
545 }
546 PetscCall(VecRestoreArrayRead(yin[j], &ynext));
547 if (stop) break;
548 }
549 PetscCall(VecRestoreArrayRead(yin[i], &yfirst));
550
551 // we found m vectors yin[i..j)
552 PetscCall(PetscBLASIntCast(j - i, &m));
553 if (m > 1) {
554 PetscBLASInt ione = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
555 PetscScalar one = 1, zero = 0;
556
557 PetscCallBLAS("BLASgemv", BLASgemv_(trans, &n, &m, &one, yarray, &lda2, xarray, &ione, &zero, z + i, &ione));
558 PetscCall(PetscLogFlops(PetscMax(m * (2.0 * n - 1), 0.0)));
559 } else {
560 if (nfail == 0) {
561 if (conjugate) PetscCall(VecDot_Seq(xin, yin[i], z + i));
562 else PetscCall(VecTDot_Seq(xin, yin[i], z + i));
563 nfail++;
564 } else break;
565 }
566 i = j;
567 }
568 PetscCall(VecRestoreArrayRead(xin, &xarray));
569 if (i < nv) { // finish the remaining if any
570 if (conjugate) PetscCall(VecMDot_Seq(xin, nv - i, yin + i, z + i));
571 else PetscCall(VecMTDot_Seq(xin, nv - i, yin + i, z + i));
572 }
573 PetscFunctionReturn(PETSC_SUCCESS);
574 }
575
VecMDot_Seq_GEMV(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)576 PetscErrorCode VecMDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
577 {
578 PetscFunctionBegin;
579 if (xin->map->n > PETSC_BLAS_INT_MAX) {
580 PetscCall(VecMDot_Seq(xin, nv, yin, z));
581 } else {
582 PetscCall(VecMultiDot_Seq_GEMV(PETSC_TRUE, xin, nv, yin, z));
583 }
584 PetscFunctionReturn(PETSC_SUCCESS);
585 }
586
VecMTDot_Seq_GEMV(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)587 PetscErrorCode VecMTDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
588 {
589 PetscFunctionBegin;
590 if (xin->map->n > PETSC_BLAS_INT_MAX) {
591 PetscCall(VecMTDot_Seq(xin, nv, yin, z));
592 } else {
593 PetscCall(VecMultiDot_Seq_GEMV(PETSC_FALSE, xin, nv, yin, z));
594 }
595 PetscFunctionReturn(PETSC_SUCCESS);
596 }
597
VecMinMax_Seq(Vec xin,PetscInt * idx,PetscReal * z,PetscReal minmax,int (* const cmp)(PetscReal,PetscReal))598 static PetscErrorCode VecMinMax_Seq(Vec xin, PetscInt *idx, PetscReal *z, PetscReal minmax, int (*const cmp)(PetscReal, PetscReal))
599 {
600 const PetscInt n = xin->map->n;
601 PetscInt j = -1;
602
603 PetscFunctionBegin;
604 if (n) {
605 const PetscScalar *xx;
606
607 PetscCall(VecGetArrayRead(xin, &xx));
608 minmax = PetscRealPart(xx[(j = 0)]);
609 for (PetscInt i = 1; i < n; ++i) {
610 const PetscReal tmp = PetscRealPart(xx[i]);
611
612 if (cmp(tmp, minmax)) {
613 j = i;
614 minmax = tmp;
615 }
616 }
617 PetscCall(VecRestoreArrayRead(xin, &xx));
618 }
619 *z = minmax;
620 if (idx) *idx = j;
621 PetscFunctionReturn(PETSC_SUCCESS);
622 }
623
VecMax_Seq_GT(PetscReal l,PetscReal r)624 static int VecMax_Seq_GT(PetscReal l, PetscReal r)
625 {
626 return l > r;
627 }
628
VecMax_Seq(Vec xin,PetscInt * idx,PetscReal * z)629 PetscErrorCode VecMax_Seq(Vec xin, PetscInt *idx, PetscReal *z)
630 {
631 PetscFunctionBegin;
632 PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MIN_REAL, VecMax_Seq_GT));
633 PetscFunctionReturn(PETSC_SUCCESS);
634 }
635
VecMin_Seq_LT(PetscReal l,PetscReal r)636 static int VecMin_Seq_LT(PetscReal l, PetscReal r)
637 {
638 return l < r;
639 }
640
VecMin_Seq(Vec xin,PetscInt * idx,PetscReal * z)641 PetscErrorCode VecMin_Seq(Vec xin, PetscInt *idx, PetscReal *z)
642 {
643 PetscFunctionBegin;
644 PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MAX_REAL, VecMin_Seq_LT));
645 PetscFunctionReturn(PETSC_SUCCESS);
646 }
647
VecSet_Seq(Vec xin,PetscScalar alpha)648 PetscErrorCode VecSet_Seq(Vec xin, PetscScalar alpha)
649 {
650 const PetscInt n = xin->map->n;
651 PetscScalar *xx;
652
653 PetscFunctionBegin;
654 PetscCall(VecGetArrayWrite(xin, &xx));
655 if (alpha == (PetscScalar)0.0) {
656 PetscCall(PetscArrayzero(xx, n));
657 } else {
658 for (PetscInt i = 0; i < n; i++) xx[i] = alpha;
659 }
660 PetscCall(VecRestoreArrayWrite(xin, &xx));
661 PetscFunctionReturn(PETSC_SUCCESS);
662 }
663
VecMAXPY_Seq(Vec xin,PetscInt nv,const PetscScalar * alpha,Vec * y)664 PetscErrorCode VecMAXPY_Seq(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
665 {
666 const PetscInt j_rem = nv & 0x3, n = xin->map->n;
667 const PetscScalar *yptr[4];
668 PetscScalar *xx;
669 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
670 #pragma disjoint(*xx, **yptr, *aptr)
671 #endif
672
673 PetscFunctionBegin;
674 PetscCall(PetscLogFlops(nv * 2.0 * n));
675 PetscCall(VecGetArray(xin, &xx));
676 for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
677 switch (j_rem) {
678 case 3:
679 PetscKernelAXPY3(xx, alpha[0], alpha[1], alpha[2], yptr[0], yptr[1], yptr[2], n);
680 break;
681 case 2:
682 PetscKernelAXPY2(xx, alpha[0], alpha[1], yptr[0], yptr[1], n);
683 break;
684 case 1:
685 PetscKernelAXPY(xx, alpha[0], yptr[0], n);
686 default:
687 break;
688 }
689 for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
690 alpha += j_rem;
691 y += j_rem;
692 for (PetscInt j = j_rem, inc = 4; j < nv; j += inc, alpha += inc, y += inc) {
693 for (PetscInt i = 0; i < inc; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
694 PetscKernelAXPY4(xx, alpha[0], alpha[1], alpha[2], alpha[3], yptr[0], yptr[1], yptr[2], yptr[3], n);
695 for (PetscInt i = 0; i < inc; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
696 }
697 PetscCall(VecRestoreArray(xin, &xx));
698 PetscFunctionReturn(PETSC_SUCCESS);
699 }
700
701 /* y = y + sum alpha[i] x[i] */
VecMAXPY_Seq_GEMV(Vec yin,PetscInt nv,const PetscScalar alpha[],Vec xin[])702 PetscErrorCode VecMAXPY_Seq_GEMV(Vec yin, PetscInt nv, const PetscScalar alpha[], Vec xin[])
703 {
704 PetscInt i, j, nfail;
705 PetscScalar *yarray;
706 const PetscScalar *xfirst, *xnext, *xarray;
707 PetscBool stop = PETSC_FALSE;
708 PetscInt64 lda = 0;
709 PetscBLASInt n, m;
710
711 PetscFunctionBegin;
712 if (yin->map->n == 0 || yin->map->n > PETSC_BLAS_INT_MAX) {
713 PetscCall(VecMAXPY_Seq(yin, nv, alpha, xin));
714 PetscFunctionReturn(PETSC_SUCCESS);
715 }
716
717 PetscCall(PetscBLASIntCast(yin->map->n, &n));
718 PetscCall(VecGetArray(yin, &yarray));
719 i = nfail = 0;
720 while (i < nv) {
721 stop = PETSC_FALSE;
722 PetscCall(VecGetArrayRead(xin[i], &xfirst));
723 xarray = xfirst;
724 for (j = i + 1; j < nv; j++) {
725 PetscCall(VecGetArrayRead(xin[j], &xnext));
726 if (j == i + 1) {
727 lda = xnext - xarray;
728 if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
729 } else if (lda * (j - i) != xnext - xarray) { // not in the same stride? if so, stop here
730 stop = PETSC_TRUE;
731 }
732 PetscCall(VecRestoreArrayRead(xin[j], &xnext));
733 if (stop) break;
734 }
735 PetscCall(VecRestoreArrayRead(xin[i], &xfirst));
736
737 PetscCall(PetscBLASIntCast(j - i, &m));
738 if (m > 1) {
739 PetscBLASInt incx = 1, incy = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
740 PetscScalar one = 1;
741 PetscCallBLAS("BLASgemv", BLASgemv_("N", &n, &m, &one, xarray, &lda2, alpha + i, &incx, &one, yarray, &incy));
742 PetscCall(PetscLogFlops(m * 2.0 * n));
743 } else {
744 // we only allow falling back on VecAXPY once
745 if (nfail++ == 0) PetscCall(VecAXPY_Seq(yin, alpha[i], xin[i]));
746 else break; // break the while loop
747 }
748 i = j;
749 }
750 PetscCall(VecRestoreArray(yin, &yarray));
751
752 // finish the remaining if any
753 if (i < nv) PetscCall(VecMAXPY_Seq(yin, nv - i, alpha + i, xin + i));
754 PetscFunctionReturn(PETSC_SUCCESS);
755 }
756
757 #include <../src/vec/vec/impls/seq/ftn-kernels/faypx.h>
758
VecAYPX_Seq(Vec yin,PetscScalar alpha,Vec xin)759 PetscErrorCode VecAYPX_Seq(Vec yin, PetscScalar alpha, Vec xin)
760 {
761 PetscFunctionBegin;
762 if (alpha == (PetscScalar)0.0) {
763 PetscCall(VecCopy(xin, yin));
764 } else if (alpha == (PetscScalar)1.0) {
765 PetscCall(VecAXPY_Seq(yin, alpha, xin));
766 } else {
767 const PetscInt n = yin->map->n;
768 const PetscScalar *xx;
769 PetscScalar *yy;
770
771 PetscCall(VecGetArrayRead(xin, &xx));
772 PetscCall(VecGetArray(yin, &yy));
773 if (alpha == (PetscScalar)-1.0) {
774 for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] - yy[i];
775 PetscCall(PetscLogFlops(n));
776 } else {
777 #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
778 fortranaypx_(&n, &alpha, xx, yy);
779 #else
780 for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] + alpha * yy[i];
781 #endif
782 PetscCall(PetscLogFlops(2 * n));
783 }
784 PetscCall(VecRestoreArrayRead(xin, &xx));
785 PetscCall(VecRestoreArray(yin, &yy));
786 }
787 PetscFunctionReturn(PETSC_SUCCESS);
788 }
789
790 #include <../src/vec/vec/impls/seq/ftn-kernels/fwaxpy.h>
791 /*
792 IBM ESSL contains a routine dzaxpy() that is our WAXPY() but it appears
793 to be slower than a regular C loop. Hence,we do not include it.
794 void ?zaxpy(int*,PetscScalar*,PetscScalar*,int*,PetscScalar*,int*,PetscScalar*,int*);
795 */
796
VecWAXPY_Seq(Vec win,PetscScalar alpha,Vec xin,Vec yin)797 PetscErrorCode VecWAXPY_Seq(Vec win, PetscScalar alpha, Vec xin, Vec yin)
798 {
799 const PetscInt n = win->map->n;
800 const PetscScalar *yy, *xx;
801 PetscScalar *ww;
802
803 PetscFunctionBegin;
804 PetscCall(VecGetArrayRead(xin, &xx));
805 PetscCall(VecGetArrayRead(yin, &yy));
806 PetscCall(VecGetArray(win, &ww));
807 if (alpha == (PetscScalar)1.0) {
808 PetscCall(PetscLogFlops(n));
809 /* could call BLAS axpy after call to memcopy, but may be slower */
810 for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + xx[i];
811 } else if (alpha == (PetscScalar)-1.0) {
812 PetscCall(PetscLogFlops(n));
813 for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] - xx[i];
814 } else if (alpha == (PetscScalar)0.0) {
815 PetscCall(PetscArraycpy(ww, yy, n));
816 } else {
817 PetscCall(PetscLogFlops(2.0 * n));
818 #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
819 fortranwaxpy_(&n, &alpha, xx, yy, ww);
820 #else
821 for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + alpha * xx[i];
822 #endif
823 }
824 PetscCall(VecRestoreArrayRead(xin, &xx));
825 PetscCall(VecRestoreArrayRead(yin, &yy));
826 PetscCall(VecRestoreArray(win, &ww));
827 PetscFunctionReturn(PETSC_SUCCESS);
828 }
829
VecMaxPointwiseDivide_Seq(Vec xin,Vec yin,PetscReal * max)830 PetscErrorCode VecMaxPointwiseDivide_Seq(Vec xin, Vec yin, PetscReal *max)
831 {
832 const PetscInt n = xin->map->n;
833 const PetscScalar *xx, *yy;
834 PetscReal m = 0.0;
835
836 PetscFunctionBegin;
837 PetscCall(VecGetArrayRead(xin, &xx));
838 PetscCall(VecGetArrayRead(yin, &yy));
839 for (PetscInt i = 0; i < n; ++i) {
840 const PetscReal v = PetscAbsScalar(yy[i] == (PetscScalar)0.0 ? xx[i] : xx[i] / yy[i]);
841
842 // use a separate value to not re-evaluate side-effects
843 m = PetscMax(v, m);
844 }
845 PetscCall(VecRestoreArrayRead(xin, &xx));
846 PetscCall(VecRestoreArrayRead(yin, &yy));
847 PetscCall(PetscLogFlops(n));
848 *max = m;
849 PetscFunctionReturn(PETSC_SUCCESS);
850 }
851
VecPlaceArray_Seq(Vec vin,const PetscScalar * a)852 PetscErrorCode VecPlaceArray_Seq(Vec vin, const PetscScalar *a)
853 {
854 Vec_Seq *v = (Vec_Seq *)vin->data;
855
856 PetscFunctionBegin;
857 PetscCheck(!v->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
858 v->unplacedarray = v->array; /* save previous array so reset can bring it back */
859 v->array = (PetscScalar *)a;
860 PetscFunctionReturn(PETSC_SUCCESS);
861 }
862
VecReplaceArray_Seq(Vec vin,const PetscScalar * a)863 PetscErrorCode VecReplaceArray_Seq(Vec vin, const PetscScalar *a)
864 {
865 Vec_Seq *v = (Vec_Seq *)vin->data;
866
867 PetscFunctionBegin;
868 PetscCall(PetscFree(v->array_allocated));
869 v->array_allocated = v->array = (PetscScalar *)a;
870 PetscFunctionReturn(PETSC_SUCCESS);
871 }
872