xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision e8c0849ab8fe171bed529bea27238c9b402db591) !
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8   #include <immintrin.h>
9 #elif defined(PETSC_HAVE_XMMINTRIN_H)
10   #include <xmmintrin.h>
11 #endif
12 
MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)13 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
14 {
15   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
16   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
17   const PetscInt *idx;
18   PetscInt        start, end, *ai, *aj, bs;
19   PetscBT         table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
28 
29   PetscCall(PetscBTCreate(m, &table));
30   PetscCall(PetscMalloc1(m + 1, &nidx));
31 
32   for (i = 0; i < is_max; i++) {
33     /* Initialise the two local arrays */
34     isz = 0;
35     PetscCall(PetscBTMemzero(m, table));
36 
37     /* Extract the indices, assume there can be duplicate entries */
38     PetscCall(ISGetIndices(is[i], &idx));
39     PetscCall(ISGetLocalSize(is[i], &n));
40 
41     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42     for (j = 0; j < n; ++j) {
43       ival = idx[j] / bs; /* convert the indices into block indices */
44       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
45       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
46     }
47     PetscCall(ISRestoreIndices(is[i], &idx));
48     PetscCall(ISDestroy(&is[i]));
49 
50     k = 0;
51     for (j = 0; j < ov; j++) { /* for each overlap*/
52       n = isz;
53       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
54         row   = nidx[k];
55         start = ai[row];
56         end   = ai[row + 1];
57         for (l = start; l < end; l++) {
58           val = aj[l];
59           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
60         }
61       }
62     }
63     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
64   }
65   PetscCall(PetscBTDestroy(&table));
66   PetscCall(PetscFree(nidx));
67   PetscFunctionReturn(PETSC_SUCCESS);
68 }
69 
MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)70 static PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
71 {
72   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
73   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
74   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
75   const PetscInt *irow, *icol;
76   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
77   PetscInt       *aj = a->j, *ai = a->i;
78   MatScalar      *mat_a;
79   Mat             C;
80   PetscBool       flag;
81 
82   PetscFunctionBegin;
83   PetscCall(ISGetIndices(isrow, &irow));
84   PetscCall(ISGetIndices(iscol, &icol));
85   PetscCall(ISGetLocalSize(isrow, &nrows));
86   PetscCall(ISGetLocalSize(iscol, &ncols));
87 
88   PetscCall(PetscCalloc1(1 + oldcols, &smap));
89   ssmap = smap;
90   PetscCall(PetscMalloc1(1 + nrows, &lens));
91   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
92   /* determine lens of each row */
93   for (i = 0; i < nrows; i++) {
94     kstart  = ai[irow[i]];
95     kend    = kstart + a->ilen[irow[i]];
96     lens[i] = 0;
97     for (k = kstart; k < kend; k++) {
98       if (ssmap[aj[k]]) lens[i]++;
99     }
100   }
101   /* Create and fill new matrix */
102   if (scall == MAT_REUSE_MATRIX) {
103     c = (Mat_SeqBAIJ *)((*B)->data);
104 
105     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
106     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
107     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
108     PetscCall(PetscArrayzero(c->ilen, c->mbs));
109     C = *B;
110   } else {
111     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
112     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
113     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
114     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
115   }
116   c = (Mat_SeqBAIJ *)C->data;
117   for (i = 0; i < nrows; i++) {
118     row      = irow[i];
119     kstart   = ai[row];
120     kend     = kstart + a->ilen[row];
121     mat_i    = c->i[i];
122     mat_j    = PetscSafePointerPlusOffset(c->j, mat_i);
123     mat_a    = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
124     mat_ilen = c->ilen + i;
125     for (k = kstart; k < kend; k++) {
126       if ((tcol = ssmap[a->j[k]])) {
127         *mat_j++ = tcol - 1;
128         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
129         mat_a += bs2;
130         (*mat_ilen)++;
131       }
132     }
133   }
134   /* sort */
135   if (c->j && c->a) {
136     MatScalar *work;
137     PetscCall(PetscMalloc1(bs2, &work));
138     for (i = 0; i < nrows; i++) {
139       PetscInt ilen;
140       mat_i = c->i[i];
141       mat_j = c->j + mat_i;
142       mat_a = c->a + mat_i * bs2;
143       ilen  = c->ilen[i];
144       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
145     }
146     PetscCall(PetscFree(work));
147   }
148 
149   /* Free work space */
150   PetscCall(ISRestoreIndices(iscol, &icol));
151   PetscCall(PetscFree(smap));
152   PetscCall(PetscFree(lens));
153   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
154   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
155 
156   PetscCall(ISRestoreIndices(isrow, &irow));
157   *B = C;
158   PetscFunctionReturn(PETSC_SUCCESS);
159 }
160 
MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)161 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
162 {
163   IS is1, is2;
164 
165   PetscFunctionBegin;
166   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
167   if (isrow == iscol) {
168     is2 = is1;
169     PetscCall(PetscObjectReference((PetscObject)is2));
170   } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
171   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
172   PetscCall(ISDestroy(&is1));
173   PetscCall(ISDestroy(&is2));
174   PetscFunctionReturn(PETSC_SUCCESS);
175 }
176 
MatDestroySubMatrix_SeqBAIJ(Mat C)177 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
178 {
179   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
180   Mat_SubSppt *submatj = c->submatis1;
181 
182   PetscFunctionBegin;
183   PetscCall((*submatj->destroy)(C));
184   PetscCall(MatDestroySubMatrix_Private(submatj));
185   PetscFunctionReturn(PETSC_SUCCESS);
186 }
187 
188 /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat * mat[])189 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
190 {
191   PetscInt     i;
192   Mat          C;
193   Mat_SeqBAIJ *c;
194   Mat_SubSppt *submatj;
195 
196   PetscFunctionBegin;
197   for (i = 0; i < n; i++) {
198     C       = (*mat)[i];
199     c       = (Mat_SeqBAIJ *)C->data;
200     submatj = c->submatis1;
201     if (submatj) {
202       if (--((PetscObject)C)->refct <= 0) {
203         PetscCall(PetscFree(C->factorprefix));
204         PetscCall((*submatj->destroy)(C));
205         PetscCall(MatDestroySubMatrix_Private(submatj));
206         PetscCall(PetscFree(C->defaultvectype));
207         PetscCall(PetscFree(C->defaultrandtype));
208         PetscCall(PetscLayoutDestroy(&C->rmap));
209         PetscCall(PetscLayoutDestroy(&C->cmap));
210         PetscCall(PetscHeaderDestroy(&C));
211       }
212     } else {
213       PetscCall(MatDestroy(&C));
214     }
215   }
216 
217   /* Destroy Dummy submatrices created for reuse */
218   PetscCall(MatDestroySubMatrices_Dummy(n, mat));
219 
220   PetscCall(PetscFree(*mat));
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])224 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
225 {
226   PetscInt i;
227 
228   PetscFunctionBegin;
229   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
230 
231   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
232   PetscFunctionReturn(PETSC_SUCCESS);
233 }
234 
235 /* Should check that shapes of vectors and matrices match */
MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)236 PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
237 {
238   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
239   PetscScalar       *z, sum;
240   const PetscScalar *x;
241   const MatScalar   *v;
242   PetscInt           mbs, i, n;
243   const PetscInt    *idx, *ii, *ridx = NULL;
244   PetscBool          usecprow = a->compressedrow.use;
245 
246   PetscFunctionBegin;
247   PetscCall(VecGetArrayRead(xx, &x));
248   PetscCall(VecGetArrayWrite(zz, &z));
249 
250   if (usecprow) {
251     mbs  = a->compressedrow.nrows;
252     ii   = a->compressedrow.i;
253     ridx = a->compressedrow.rindex;
254     PetscCall(PetscArrayzero(z, a->mbs));
255   } else {
256     mbs = a->mbs;
257     ii  = a->i;
258   }
259 
260   for (i = 0; i < mbs; i++) {
261     n   = ii[1] - ii[0];
262     v   = a->a + ii[0];
263     idx = a->j + ii[0];
264     ii++;
265     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
266     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267     sum = 0.0;
268     PetscSparseDensePlusDot(sum, x, v, idx, n);
269     if (usecprow) {
270       z[ridx[i]] = sum;
271     } else {
272       z[i] = sum;
273     }
274   }
275   PetscCall(VecRestoreArrayRead(xx, &x));
276   PetscCall(VecRestoreArrayWrite(zz, &z));
277   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
278   PetscFunctionReturn(PETSC_SUCCESS);
279 }
280 
MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)281 PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
282 {
283   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
284   PetscScalar       *z = NULL, sum1, sum2, *zarray;
285   const PetscScalar *x, *xb;
286   PetscScalar        x1, x2;
287   const MatScalar   *v;
288   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
289   PetscBool          usecprow = a->compressedrow.use;
290 
291   PetscFunctionBegin;
292   PetscCall(VecGetArrayRead(xx, &x));
293   PetscCall(VecGetArrayWrite(zz, &zarray));
294 
295   idx = a->j;
296   v   = a->a;
297   if (usecprow) {
298     mbs  = a->compressedrow.nrows;
299     ii   = a->compressedrow.i;
300     ridx = a->compressedrow.rindex;
301     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
302   } else {
303     mbs = a->mbs;
304     ii  = a->i;
305     z   = zarray;
306   }
307 
308   for (i = 0; i < mbs; i++) {
309     n = ii[1] - ii[0];
310     ii++;
311     sum1 = 0.0;
312     sum2 = 0.0;
313     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
314     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
315     for (j = 0; j < n; j++) {
316       xb = x + 2 * (*idx++);
317       x1 = xb[0];
318       x2 = xb[1];
319       sum1 += v[0] * x1 + v[2] * x2;
320       sum2 += v[1] * x1 + v[3] * x2;
321       v += 4;
322     }
323     if (usecprow) z = zarray + 2 * ridx[i];
324     z[0] = sum1;
325     z[1] = sum2;
326     if (!usecprow) z += 2;
327   }
328   PetscCall(VecRestoreArrayRead(xx, &x));
329   PetscCall(VecRestoreArrayWrite(zz, &zarray));
330   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
331   PetscFunctionReturn(PETSC_SUCCESS);
332 }
333 
MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)334 PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
335 {
336   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
337   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
338   const PetscScalar *x, *xb;
339   const MatScalar   *v;
340   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
341   PetscBool          usecprow = a->compressedrow.use;
342 
343 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
344   #pragma disjoint(*v, *z, *xb)
345 #endif
346 
347   PetscFunctionBegin;
348   PetscCall(VecGetArrayRead(xx, &x));
349   PetscCall(VecGetArrayWrite(zz, &zarray));
350 
351   idx = a->j;
352   v   = a->a;
353   if (usecprow) {
354     mbs  = a->compressedrow.nrows;
355     ii   = a->compressedrow.i;
356     ridx = a->compressedrow.rindex;
357     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
358   } else {
359     mbs = a->mbs;
360     ii  = a->i;
361     z   = zarray;
362   }
363 
364   for (i = 0; i < mbs; i++) {
365     n = ii[1] - ii[0];
366     ii++;
367     sum1 = 0.0;
368     sum2 = 0.0;
369     sum3 = 0.0;
370     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
371     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
372     for (j = 0; j < n; j++) {
373       xb = x + 3 * (*idx++);
374       x1 = xb[0];
375       x2 = xb[1];
376       x3 = xb[2];
377 
378       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
379       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
380       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
381       v += 9;
382     }
383     if (usecprow) z = zarray + 3 * ridx[i];
384     z[0] = sum1;
385     z[1] = sum2;
386     z[2] = sum3;
387     if (!usecprow) z += 3;
388   }
389   PetscCall(VecRestoreArrayRead(xx, &x));
390   PetscCall(VecRestoreArrayWrite(zz, &zarray));
391   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
392   PetscFunctionReturn(PETSC_SUCCESS);
393 }
394 
MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)395 PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
396 {
397   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
398   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
399   const PetscScalar *x, *xb;
400   const MatScalar   *v;
401   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
402   PetscBool          usecprow = a->compressedrow.use;
403 
404   PetscFunctionBegin;
405   PetscCall(VecGetArrayRead(xx, &x));
406   PetscCall(VecGetArrayWrite(zz, &zarray));
407 
408   idx = a->j;
409   v   = a->a;
410   if (usecprow) {
411     mbs  = a->compressedrow.nrows;
412     ii   = a->compressedrow.i;
413     ridx = a->compressedrow.rindex;
414     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
415   } else {
416     mbs = a->mbs;
417     ii  = a->i;
418     z   = zarray;
419   }
420 
421   for (i = 0; i < mbs; i++) {
422     n = ii[1] - ii[0];
423     ii++;
424     sum1 = 0.0;
425     sum2 = 0.0;
426     sum3 = 0.0;
427     sum4 = 0.0;
428 
429     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
430     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
431     for (j = 0; j < n; j++) {
432       xb = x + 4 * (*idx++);
433       x1 = xb[0];
434       x2 = xb[1];
435       x3 = xb[2];
436       x4 = xb[3];
437       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
438       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
439       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
440       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
441       v += 16;
442     }
443     if (usecprow) z = zarray + 4 * ridx[i];
444     z[0] = sum1;
445     z[1] = sum2;
446     z[2] = sum3;
447     z[3] = sum4;
448     if (!usecprow) z += 4;
449   }
450   PetscCall(VecRestoreArrayRead(xx, &x));
451   PetscCall(VecRestoreArrayWrite(zz, &zarray));
452   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
453   PetscFunctionReturn(PETSC_SUCCESS);
454 }
455 
MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)456 PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
457 {
458   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
459   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
460   const PetscScalar *xb, *x;
461   const MatScalar   *v;
462   const PetscInt    *idx, *ii, *ridx = NULL;
463   PetscInt           mbs, i, j, n;
464   PetscBool          usecprow = a->compressedrow.use;
465 
466   PetscFunctionBegin;
467   PetscCall(VecGetArrayRead(xx, &x));
468   PetscCall(VecGetArrayWrite(zz, &zarray));
469 
470   idx = a->j;
471   v   = a->a;
472   if (usecprow) {
473     mbs  = a->compressedrow.nrows;
474     ii   = a->compressedrow.i;
475     ridx = a->compressedrow.rindex;
476     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
477   } else {
478     mbs = a->mbs;
479     ii  = a->i;
480     z   = zarray;
481   }
482 
483   for (i = 0; i < mbs; i++) {
484     n = ii[1] - ii[0];
485     ii++;
486     sum1 = 0.0;
487     sum2 = 0.0;
488     sum3 = 0.0;
489     sum4 = 0.0;
490     sum5 = 0.0;
491     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
492     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493     for (j = 0; j < n; j++) {
494       xb = x + 5 * (*idx++);
495       x1 = xb[0];
496       x2 = xb[1];
497       x3 = xb[2];
498       x4 = xb[3];
499       x5 = xb[4];
500       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
501       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
502       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
503       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
504       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
505       v += 25;
506     }
507     if (usecprow) z = zarray + 5 * ridx[i];
508     z[0] = sum1;
509     z[1] = sum2;
510     z[2] = sum3;
511     z[3] = sum4;
512     z[4] = sum5;
513     if (!usecprow) z += 5;
514   }
515   PetscCall(VecRestoreArrayRead(xx, &x));
516   PetscCall(VecRestoreArrayWrite(zz, &zarray));
517   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
518   PetscFunctionReturn(PETSC_SUCCESS);
519 }
520 
MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)521 PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
522 {
523   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
524   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
525   const PetscScalar *x, *xb;
526   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
527   const MatScalar   *v;
528   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
529   PetscBool          usecprow = a->compressedrow.use;
530 
531   PetscFunctionBegin;
532   PetscCall(VecGetArrayRead(xx, &x));
533   PetscCall(VecGetArrayWrite(zz, &zarray));
534 
535   idx = a->j;
536   v   = a->a;
537   if (usecprow) {
538     mbs  = a->compressedrow.nrows;
539     ii   = a->compressedrow.i;
540     ridx = a->compressedrow.rindex;
541     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
542   } else {
543     mbs = a->mbs;
544     ii  = a->i;
545     z   = zarray;
546   }
547 
548   for (i = 0; i < mbs; i++) {
549     n = ii[1] - ii[0];
550     ii++;
551     sum1 = 0.0;
552     sum2 = 0.0;
553     sum3 = 0.0;
554     sum4 = 0.0;
555     sum5 = 0.0;
556     sum6 = 0.0;
557 
558     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
559     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
560     for (j = 0; j < n; j++) {
561       xb = x + 6 * (*idx++);
562       x1 = xb[0];
563       x2 = xb[1];
564       x3 = xb[2];
565       x4 = xb[3];
566       x5 = xb[4];
567       x6 = xb[5];
568       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
569       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
570       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
571       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
572       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
573       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
574       v += 36;
575     }
576     if (usecprow) z = zarray + 6 * ridx[i];
577     z[0] = sum1;
578     z[1] = sum2;
579     z[2] = sum3;
580     z[3] = sum4;
581     z[4] = sum5;
582     z[5] = sum6;
583     if (!usecprow) z += 6;
584   }
585 
586   PetscCall(VecRestoreArrayRead(xx, &x));
587   PetscCall(VecRestoreArrayWrite(zz, &zarray));
588   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
589   PetscFunctionReturn(PETSC_SUCCESS);
590 }
591 
MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)592 PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
593 {
594   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
595   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
596   const PetscScalar *x, *xb;
597   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
598   const MatScalar   *v;
599   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
600   PetscBool          usecprow = a->compressedrow.use;
601 
602   PetscFunctionBegin;
603   PetscCall(VecGetArrayRead(xx, &x));
604   PetscCall(VecGetArrayWrite(zz, &zarray));
605 
606   idx = a->j;
607   v   = a->a;
608   if (usecprow) {
609     mbs  = a->compressedrow.nrows;
610     ii   = a->compressedrow.i;
611     ridx = a->compressedrow.rindex;
612     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
613   } else {
614     mbs = a->mbs;
615     ii  = a->i;
616     z   = zarray;
617   }
618 
619   for (i = 0; i < mbs; i++) {
620     n = ii[1] - ii[0];
621     ii++;
622     sum1 = 0.0;
623     sum2 = 0.0;
624     sum3 = 0.0;
625     sum4 = 0.0;
626     sum5 = 0.0;
627     sum6 = 0.0;
628     sum7 = 0.0;
629 
630     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
631     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
632     for (j = 0; j < n; j++) {
633       xb = x + 7 * (*idx++);
634       x1 = xb[0];
635       x2 = xb[1];
636       x3 = xb[2];
637       x4 = xb[3];
638       x5 = xb[4];
639       x6 = xb[5];
640       x7 = xb[6];
641       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
642       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
643       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
644       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
645       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
646       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
647       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
648       v += 49;
649     }
650     if (usecprow) z = zarray + 7 * ridx[i];
651     z[0] = sum1;
652     z[1] = sum2;
653     z[2] = sum3;
654     z[3] = sum4;
655     z[4] = sum5;
656     z[5] = sum6;
657     z[6] = sum7;
658     if (!usecprow) z += 7;
659   }
660 
661   PetscCall(VecRestoreArrayRead(xx, &x));
662   PetscCall(VecRestoreArrayWrite(zz, &zarray));
663   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
664   PetscFunctionReturn(PETSC_SUCCESS);
665 }
666 
667 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)668 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
669 {
670   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
671   PetscScalar       *z = NULL, *work, *workt, *zarray;
672   const PetscScalar *x, *xb;
673   const MatScalar   *v;
674   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
675   const PetscInt    *idx, *ii, *ridx = NULL;
676   PetscInt           k;
677   PetscBool          usecprow = a->compressedrow.use;
678 
679   __m256d a0, a1, a2, a3, a4, a5;
680   __m256d w0, w1, w2, w3;
681   __m256d z0, z1, z2;
682   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
683 
684   PetscFunctionBegin;
685   PetscCall(VecGetArrayRead(xx, &x));
686   PetscCall(VecGetArrayWrite(zz, &zarray));
687 
688   idx = a->j;
689   v   = a->a;
690   if (usecprow) {
691     mbs  = a->compressedrow.nrows;
692     ii   = a->compressedrow.i;
693     ridx = a->compressedrow.rindex;
694     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
695   } else {
696     mbs = a->mbs;
697     ii  = a->i;
698     z   = zarray;
699   }
700 
701   if (!a->mult_work) {
702     k = PetscMax(A->rmap->n, A->cmap->n);
703     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
704   }
705 
706   work = a->mult_work;
707   for (i = 0; i < mbs; i++) {
708     n = ii[1] - ii[0];
709     ii++;
710     workt = work;
711     for (j = 0; j < n; j++) {
712       xb = x + bs * (*idx++);
713       for (k = 0; k < bs; k++) workt[k] = xb[k];
714       workt += bs;
715     }
716     if (usecprow) z = zarray + bs * ridx[i];
717 
718     z0 = _mm256_setzero_pd();
719     z1 = _mm256_setzero_pd();
720     z2 = _mm256_setzero_pd();
721 
722     for (j = 0; j < n; j++) {
723       /* first column of a */
724       w0 = _mm256_set1_pd(work[j * 9]);
725       a0 = _mm256_loadu_pd(&v[j * 81]);
726       z0 = _mm256_fmadd_pd(a0, w0, z0);
727       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
728       z1 = _mm256_fmadd_pd(a1, w0, z1);
729       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
730       z2 = _mm256_fmadd_pd(a2, w0, z2);
731 
732       /* second column of a */
733       w1 = _mm256_set1_pd(work[j * 9 + 1]);
734       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
735       z0 = _mm256_fmadd_pd(a0, w1, z0);
736       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
737       z1 = _mm256_fmadd_pd(a1, w1, z1);
738       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
739       z2 = _mm256_fmadd_pd(a2, w1, z2);
740 
741       /* third column of a */
742       w2 = _mm256_set1_pd(work[j * 9 + 2]);
743       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
744       z0 = _mm256_fmadd_pd(a3, w2, z0);
745       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
746       z1 = _mm256_fmadd_pd(a4, w2, z1);
747       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
748       z2 = _mm256_fmadd_pd(a5, w2, z2);
749 
750       /* fourth column of a */
751       w3 = _mm256_set1_pd(work[j * 9 + 3]);
752       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
753       z0 = _mm256_fmadd_pd(a0, w3, z0);
754       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
755       z1 = _mm256_fmadd_pd(a1, w3, z1);
756       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
757       z2 = _mm256_fmadd_pd(a2, w3, z2);
758 
759       /* fifth column of a */
760       w0 = _mm256_set1_pd(work[j * 9 + 4]);
761       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
762       z0 = _mm256_fmadd_pd(a3, w0, z0);
763       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
764       z1 = _mm256_fmadd_pd(a4, w0, z1);
765       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
766       z2 = _mm256_fmadd_pd(a5, w0, z2);
767 
768       /* sixth column of a */
769       w1 = _mm256_set1_pd(work[j * 9 + 5]);
770       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
771       z0 = _mm256_fmadd_pd(a0, w1, z0);
772       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
773       z1 = _mm256_fmadd_pd(a1, w1, z1);
774       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
775       z2 = _mm256_fmadd_pd(a2, w1, z2);
776 
777       /* seventh column of a */
778       w2 = _mm256_set1_pd(work[j * 9 + 6]);
779       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
780       z0 = _mm256_fmadd_pd(a0, w2, z0);
781       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
782       z1 = _mm256_fmadd_pd(a1, w2, z1);
783       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
784       z2 = _mm256_fmadd_pd(a2, w2, z2);
785 
786       /* eighth column of a */
787       w3 = _mm256_set1_pd(work[j * 9 + 7]);
788       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
789       z0 = _mm256_fmadd_pd(a3, w3, z0);
790       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
791       z1 = _mm256_fmadd_pd(a4, w3, z1);
792       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
793       z2 = _mm256_fmadd_pd(a5, w3, z2);
794 
795       /* ninth column of a */
796       w0 = _mm256_set1_pd(work[j * 9 + 8]);
797       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
798       z0 = _mm256_fmadd_pd(a0, w0, z0);
799       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
800       z1 = _mm256_fmadd_pd(a1, w0, z1);
801       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
802       z2 = _mm256_fmadd_pd(a2, w0, z2);
803     }
804 
805     _mm256_storeu_pd(&z[0], z0);
806     _mm256_storeu_pd(&z[4], z1);
807     _mm256_maskstore_pd(&z[8], mask1, z2);
808 
809     v += n * bs2;
810     if (!usecprow) z += bs;
811   }
812   PetscCall(VecRestoreArrayRead(xx, &x));
813   PetscCall(VecRestoreArrayWrite(zz, &zarray));
814   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
815   PetscFunctionReturn(PETSC_SUCCESS);
816 }
817 #endif
818 
MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)819 PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
820 {
821   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
822   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
823   const PetscScalar *x, *xb;
824   PetscScalar       *zarray, xv;
825   const MatScalar   *v;
826   const PetscInt    *ii, *ij = a->j, *idx;
827   PetscInt           mbs, i, j, k, n, *ridx = NULL;
828   PetscBool          usecprow = a->compressedrow.use;
829 
830   PetscFunctionBegin;
831   PetscCall(VecGetArrayRead(xx, &x));
832   PetscCall(VecGetArrayWrite(zz, &zarray));
833 
834   v = a->a;
835   if (usecprow) {
836     mbs  = a->compressedrow.nrows;
837     ii   = a->compressedrow.i;
838     ridx = a->compressedrow.rindex;
839     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
840   } else {
841     mbs = a->mbs;
842     ii  = a->i;
843     z   = zarray;
844   }
845 
846   for (i = 0; i < mbs; i++) {
847     n     = ii[i + 1] - ii[i];
848     idx   = ij + ii[i];
849     sum1  = 0.0;
850     sum2  = 0.0;
851     sum3  = 0.0;
852     sum4  = 0.0;
853     sum5  = 0.0;
854     sum6  = 0.0;
855     sum7  = 0.0;
856     sum8  = 0.0;
857     sum9  = 0.0;
858     sum10 = 0.0;
859     sum11 = 0.0;
860 
861     for (j = 0; j < n; j++) {
862       xb = x + 11 * (idx[j]);
863 
864       for (k = 0; k < 11; k++) {
865         xv = xb[k];
866         sum1 += v[0] * xv;
867         sum2 += v[1] * xv;
868         sum3 += v[2] * xv;
869         sum4 += v[3] * xv;
870         sum5 += v[4] * xv;
871         sum6 += v[5] * xv;
872         sum7 += v[6] * xv;
873         sum8 += v[7] * xv;
874         sum9 += v[8] * xv;
875         sum10 += v[9] * xv;
876         sum11 += v[10] * xv;
877         v += 11;
878       }
879     }
880     if (usecprow) z = zarray + 11 * ridx[i];
881     z[0]  = sum1;
882     z[1]  = sum2;
883     z[2]  = sum3;
884     z[3]  = sum4;
885     z[4]  = sum5;
886     z[5]  = sum6;
887     z[6]  = sum7;
888     z[7]  = sum8;
889     z[8]  = sum9;
890     z[9]  = sum10;
891     z[10] = sum11;
892 
893     if (!usecprow) z += 11;
894   }
895 
896   PetscCall(VecRestoreArrayRead(xx, &x));
897   PetscCall(VecRestoreArrayWrite(zz, &zarray));
898   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
899   PetscFunctionReturn(PETSC_SUCCESS);
900 }
901 
902 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)903 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
904 {
905   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
906   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
907   const PetscScalar *x, *xb;
908   PetscScalar       *zarray, xv;
909   const MatScalar   *v;
910   const PetscInt    *ii, *ij = a->j, *idx;
911   PetscInt           mbs, i, j, k, n, *ridx = NULL;
912   PetscBool          usecprow = a->compressedrow.use;
913 
914   PetscFunctionBegin;
915   PetscCall(VecGetArrayRead(xx, &x));
916   PetscCall(VecGetArrayWrite(zz, &zarray));
917 
918   v = a->a;
919   if (usecprow) {
920     mbs  = a->compressedrow.nrows;
921     ii   = a->compressedrow.i;
922     ridx = a->compressedrow.rindex;
923     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
924   } else {
925     mbs = a->mbs;
926     ii  = a->i;
927     z   = zarray;
928   }
929 
930   for (i = 0; i < mbs; i++) {
931     n     = ii[i + 1] - ii[i];
932     idx   = ij + ii[i];
933     sum1  = 0.0;
934     sum2  = 0.0;
935     sum3  = 0.0;
936     sum4  = 0.0;
937     sum5  = 0.0;
938     sum6  = 0.0;
939     sum7  = 0.0;
940     sum8  = 0.0;
941     sum9  = 0.0;
942     sum10 = 0.0;
943     sum11 = 0.0;
944     sum12 = 0.0;
945 
946     for (j = 0; j < n; j++) {
947       xb = x + 12 * (idx[j]);
948 
949       for (k = 0; k < 12; k++) {
950         xv = xb[k];
951         sum1 += v[0] * xv;
952         sum2 += v[1] * xv;
953         sum3 += v[2] * xv;
954         sum4 += v[3] * xv;
955         sum5 += v[4] * xv;
956         sum6 += v[5] * xv;
957         sum7 += v[6] * xv;
958         sum8 += v[7] * xv;
959         sum9 += v[8] * xv;
960         sum10 += v[9] * xv;
961         sum11 += v[10] * xv;
962         sum12 += v[11] * xv;
963         v += 12;
964       }
965     }
966     if (usecprow) z = zarray + 12 * ridx[i];
967     z[0]  = sum1;
968     z[1]  = sum2;
969     z[2]  = sum3;
970     z[3]  = sum4;
971     z[4]  = sum5;
972     z[5]  = sum6;
973     z[6]  = sum7;
974     z[7]  = sum8;
975     z[8]  = sum9;
976     z[9]  = sum10;
977     z[10] = sum11;
978     z[11] = sum12;
979     if (!usecprow) z += 12;
980   }
981   PetscCall(VecRestoreArrayRead(xx, &x));
982   PetscCall(VecRestoreArrayWrite(zz, &zarray));
983   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)987 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
988 {
989   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
990   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
991   const PetscScalar *x, *xb;
992   PetscScalar       *zarray, *yarray, xv;
993   const MatScalar   *v;
994   const PetscInt    *ii, *ij = a->j, *idx;
995   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
996   PetscBool          usecprow = a->compressedrow.use;
997 
998   PetscFunctionBegin;
999   PetscCall(VecGetArrayRead(xx, &x));
1000   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1001 
1002   v = a->a;
1003   if (usecprow) {
1004     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1005     mbs  = a->compressedrow.nrows;
1006     ii   = a->compressedrow.i;
1007     ridx = a->compressedrow.rindex;
1008   } else {
1009     ii = a->i;
1010     y  = yarray;
1011     z  = zarray;
1012   }
1013 
1014   for (i = 0; i < mbs; i++) {
1015     n   = ii[i + 1] - ii[i];
1016     idx = ij + ii[i];
1017 
1018     if (usecprow) {
1019       y = yarray + 12 * ridx[i];
1020       z = zarray + 12 * ridx[i];
1021     }
1022     sum1  = y[0];
1023     sum2  = y[1];
1024     sum3  = y[2];
1025     sum4  = y[3];
1026     sum5  = y[4];
1027     sum6  = y[5];
1028     sum7  = y[6];
1029     sum8  = y[7];
1030     sum9  = y[8];
1031     sum10 = y[9];
1032     sum11 = y[10];
1033     sum12 = y[11];
1034 
1035     for (j = 0; j < n; j++) {
1036       xb = x + 12 * (idx[j]);
1037 
1038       for (k = 0; k < 12; k++) {
1039         xv = xb[k];
1040         sum1 += v[0] * xv;
1041         sum2 += v[1] * xv;
1042         sum3 += v[2] * xv;
1043         sum4 += v[3] * xv;
1044         sum5 += v[4] * xv;
1045         sum6 += v[5] * xv;
1046         sum7 += v[6] * xv;
1047         sum8 += v[7] * xv;
1048         sum9 += v[8] * xv;
1049         sum10 += v[9] * xv;
1050         sum11 += v[10] * xv;
1051         sum12 += v[11] * xv;
1052         v += 12;
1053       }
1054     }
1055 
1056     z[0]  = sum1;
1057     z[1]  = sum2;
1058     z[2]  = sum3;
1059     z[3]  = sum4;
1060     z[4]  = sum5;
1061     z[5]  = sum6;
1062     z[6]  = sum7;
1063     z[7]  = sum8;
1064     z[8]  = sum9;
1065     z[9]  = sum10;
1066     z[10] = sum11;
1067     z[11] = sum12;
1068     if (!usecprow) {
1069       y += 12;
1070       z += 12;
1071     }
1072   }
1073   PetscCall(VecRestoreArrayRead(xx, &x));
1074   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1075   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1076   PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078 
1079 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)1080 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1081 {
1082   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1083   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1084   const PetscScalar *x, *xb;
1085   PetscScalar        x1, x2, x3, x4, *zarray;
1086   const MatScalar   *v;
1087   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1088   PetscInt           mbs, i, j, n;
1089   PetscBool          usecprow = a->compressedrow.use;
1090 
1091   PetscFunctionBegin;
1092   PetscCall(VecGetArrayRead(xx, &x));
1093   PetscCall(VecGetArrayWrite(zz, &zarray));
1094 
1095   v = a->a;
1096   if (usecprow) {
1097     mbs  = a->compressedrow.nrows;
1098     ii   = a->compressedrow.i;
1099     ridx = a->compressedrow.rindex;
1100     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1101   } else {
1102     mbs = a->mbs;
1103     ii  = a->i;
1104     z   = zarray;
1105   }
1106 
1107   for (i = 0; i < mbs; i++) {
1108     n   = ii[i + 1] - ii[i];
1109     idx = ij + ii[i];
1110 
1111     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1112     for (j = 0; j < n; j++) {
1113       xb = x + 12 * (idx[j]);
1114       x1 = xb[0];
1115       x2 = xb[1];
1116       x3 = xb[2];
1117       x4 = xb[3];
1118 
1119       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1120       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1121       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1122       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1123       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1124       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1125       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1126       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1127       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1128       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1129       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1130       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1131       v += 48;
1132 
1133       x1 = xb[4];
1134       x2 = xb[5];
1135       x3 = xb[6];
1136       x4 = xb[7];
1137 
1138       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1139       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1140       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1141       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1142       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1143       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1144       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1145       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1146       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1147       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1148       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1149       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1150       v += 48;
1151 
1152       x1 = xb[8];
1153       x2 = xb[9];
1154       x3 = xb[10];
1155       x4 = xb[11];
1156       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1157       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1158       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1159       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1160       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1161       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1162       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1163       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1164       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1165       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1166       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1167       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1168       v += 48;
1169     }
1170     if (usecprow) z = zarray + 12 * ridx[i];
1171     z[0]  = sum1;
1172     z[1]  = sum2;
1173     z[2]  = sum3;
1174     z[3]  = sum4;
1175     z[4]  = sum5;
1176     z[5]  = sum6;
1177     z[6]  = sum7;
1178     z[7]  = sum8;
1179     z[8]  = sum9;
1180     z[9]  = sum10;
1181     z[10] = sum11;
1182     z[11] = sum12;
1183     if (!usecprow) z += 12;
1184   }
1185   PetscCall(VecRestoreArrayRead(xx, &x));
1186   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1187   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1188   PetscFunctionReturn(PETSC_SUCCESS);
1189 }
1190 
1191 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)1192 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1193 {
1194   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1195   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1196   const PetscScalar *x, *xb;
1197   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1198   const MatScalar   *v;
1199   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1200   PetscInt           mbs      = a->mbs, i, j, n;
1201   PetscBool          usecprow = a->compressedrow.use;
1202 
1203   PetscFunctionBegin;
1204   PetscCall(VecGetArrayRead(xx, &x));
1205   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1206 
1207   v = a->a;
1208   if (usecprow) {
1209     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1210     mbs  = a->compressedrow.nrows;
1211     ii   = a->compressedrow.i;
1212     ridx = a->compressedrow.rindex;
1213   } else {
1214     ii = a->i;
1215     y  = yarray;
1216     z  = zarray;
1217   }
1218 
1219   for (i = 0; i < mbs; i++) {
1220     n   = ii[i + 1] - ii[i];
1221     idx = ij + ii[i];
1222 
1223     if (usecprow) {
1224       y = yarray + 12 * ridx[i];
1225       z = zarray + 12 * ridx[i];
1226     }
1227     sum1  = y[0];
1228     sum2  = y[1];
1229     sum3  = y[2];
1230     sum4  = y[3];
1231     sum5  = y[4];
1232     sum6  = y[5];
1233     sum7  = y[6];
1234     sum8  = y[7];
1235     sum9  = y[8];
1236     sum10 = y[9];
1237     sum11 = y[10];
1238     sum12 = y[11];
1239 
1240     for (j = 0; j < n; j++) {
1241       xb = x + 12 * (idx[j]);
1242       x1 = xb[0];
1243       x2 = xb[1];
1244       x3 = xb[2];
1245       x4 = xb[3];
1246 
1247       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1248       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1249       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1250       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1251       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1252       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1253       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1254       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1255       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1256       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1257       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1258       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1259       v += 48;
1260 
1261       x1 = xb[4];
1262       x2 = xb[5];
1263       x3 = xb[6];
1264       x4 = xb[7];
1265 
1266       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1267       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1268       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1269       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1270       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1271       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1272       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1273       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1274       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1275       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1276       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1277       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1278       v += 48;
1279 
1280       x1 = xb[8];
1281       x2 = xb[9];
1282       x3 = xb[10];
1283       x4 = xb[11];
1284       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296       v += 48;
1297     }
1298     z[0]  = sum1;
1299     z[1]  = sum2;
1300     z[2]  = sum3;
1301     z[3]  = sum4;
1302     z[4]  = sum5;
1303     z[5]  = sum6;
1304     z[6]  = sum7;
1305     z[7]  = sum8;
1306     z[8]  = sum9;
1307     z[9]  = sum10;
1308     z[10] = sum11;
1309     z[11] = sum12;
1310     if (!usecprow) {
1311       y += 12;
1312       z += 12;
1313     }
1314   }
1315   PetscCall(VecRestoreArrayRead(xx, &x));
1316   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1317   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1318   PetscFunctionReturn(PETSC_SUCCESS);
1319 }
1320 
1321 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)1322 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1323 {
1324   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1325   PetscScalar       *z = NULL, *zarray;
1326   const PetscScalar *x, *work;
1327   const MatScalar   *v = a->a;
1328   PetscInt           mbs, i, j, n;
1329   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1330   PetscBool          usecprow = a->compressedrow.use;
1331   const PetscInt     bs = 12, bs2 = 144;
1332 
1333   __m256d a0, a1, a2, a3, a4, a5;
1334   __m256d w0, w1, w2, w3;
1335   __m256d z0, z1, z2;
1336 
1337   PetscFunctionBegin;
1338   PetscCall(VecGetArrayRead(xx, &x));
1339   PetscCall(VecGetArrayWrite(zz, &zarray));
1340 
1341   if (usecprow) {
1342     mbs  = a->compressedrow.nrows;
1343     ii   = a->compressedrow.i;
1344     ridx = a->compressedrow.rindex;
1345     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1346   } else {
1347     mbs = a->mbs;
1348     ii  = a->i;
1349     z   = zarray;
1350   }
1351 
1352   for (i = 0; i < mbs; i++) {
1353     z0 = _mm256_setzero_pd();
1354     z1 = _mm256_setzero_pd();
1355     z2 = _mm256_setzero_pd();
1356 
1357     n = ii[1] - ii[0];
1358     ii++;
1359     for (j = 0; j < n; j++) {
1360       work = x + bs * (*idx++);
1361 
1362       /* first column of a */
1363       w0 = _mm256_set1_pd(work[0]);
1364       a0 = _mm256_loadu_pd(v + 0);
1365       z0 = _mm256_fmadd_pd(a0, w0, z0);
1366       a1 = _mm256_loadu_pd(v + 4);
1367       z1 = _mm256_fmadd_pd(a1, w0, z1);
1368       a2 = _mm256_loadu_pd(v + 8);
1369       z2 = _mm256_fmadd_pd(a2, w0, z2);
1370 
1371       /* second column of a */
1372       w1 = _mm256_set1_pd(work[1]);
1373       a3 = _mm256_loadu_pd(v + 12);
1374       z0 = _mm256_fmadd_pd(a3, w1, z0);
1375       a4 = _mm256_loadu_pd(v + 16);
1376       z1 = _mm256_fmadd_pd(a4, w1, z1);
1377       a5 = _mm256_loadu_pd(v + 20);
1378       z2 = _mm256_fmadd_pd(a5, w1, z2);
1379 
1380       /* third column of a */
1381       w2 = _mm256_set1_pd(work[2]);
1382       a0 = _mm256_loadu_pd(v + 24);
1383       z0 = _mm256_fmadd_pd(a0, w2, z0);
1384       a1 = _mm256_loadu_pd(v + 28);
1385       z1 = _mm256_fmadd_pd(a1, w2, z1);
1386       a2 = _mm256_loadu_pd(v + 32);
1387       z2 = _mm256_fmadd_pd(a2, w2, z2);
1388 
1389       /* fourth column of a */
1390       w3 = _mm256_set1_pd(work[3]);
1391       a3 = _mm256_loadu_pd(v + 36);
1392       z0 = _mm256_fmadd_pd(a3, w3, z0);
1393       a4 = _mm256_loadu_pd(v + 40);
1394       z1 = _mm256_fmadd_pd(a4, w3, z1);
1395       a5 = _mm256_loadu_pd(v + 44);
1396       z2 = _mm256_fmadd_pd(a5, w3, z2);
1397 
1398       /* fifth column of a */
1399       w0 = _mm256_set1_pd(work[4]);
1400       a0 = _mm256_loadu_pd(v + 48);
1401       z0 = _mm256_fmadd_pd(a0, w0, z0);
1402       a1 = _mm256_loadu_pd(v + 52);
1403       z1 = _mm256_fmadd_pd(a1, w0, z1);
1404       a2 = _mm256_loadu_pd(v + 56);
1405       z2 = _mm256_fmadd_pd(a2, w0, z2);
1406 
1407       /* sixth column of a */
1408       w1 = _mm256_set1_pd(work[5]);
1409       a3 = _mm256_loadu_pd(v + 60);
1410       z0 = _mm256_fmadd_pd(a3, w1, z0);
1411       a4 = _mm256_loadu_pd(v + 64);
1412       z1 = _mm256_fmadd_pd(a4, w1, z1);
1413       a5 = _mm256_loadu_pd(v + 68);
1414       z2 = _mm256_fmadd_pd(a5, w1, z2);
1415 
1416       /* seventh column of a */
1417       w2 = _mm256_set1_pd(work[6]);
1418       a0 = _mm256_loadu_pd(v + 72);
1419       z0 = _mm256_fmadd_pd(a0, w2, z0);
1420       a1 = _mm256_loadu_pd(v + 76);
1421       z1 = _mm256_fmadd_pd(a1, w2, z1);
1422       a2 = _mm256_loadu_pd(v + 80);
1423       z2 = _mm256_fmadd_pd(a2, w2, z2);
1424 
1425       /* eighth column of a */
1426       w3 = _mm256_set1_pd(work[7]);
1427       a3 = _mm256_loadu_pd(v + 84);
1428       z0 = _mm256_fmadd_pd(a3, w3, z0);
1429       a4 = _mm256_loadu_pd(v + 88);
1430       z1 = _mm256_fmadd_pd(a4, w3, z1);
1431       a5 = _mm256_loadu_pd(v + 92);
1432       z2 = _mm256_fmadd_pd(a5, w3, z2);
1433 
1434       /* ninth column of a */
1435       w0 = _mm256_set1_pd(work[8]);
1436       a0 = _mm256_loadu_pd(v + 96);
1437       z0 = _mm256_fmadd_pd(a0, w0, z0);
1438       a1 = _mm256_loadu_pd(v + 100);
1439       z1 = _mm256_fmadd_pd(a1, w0, z1);
1440       a2 = _mm256_loadu_pd(v + 104);
1441       z2 = _mm256_fmadd_pd(a2, w0, z2);
1442 
1443       /* tenth column of a */
1444       w1 = _mm256_set1_pd(work[9]);
1445       a3 = _mm256_loadu_pd(v + 108);
1446       z0 = _mm256_fmadd_pd(a3, w1, z0);
1447       a4 = _mm256_loadu_pd(v + 112);
1448       z1 = _mm256_fmadd_pd(a4, w1, z1);
1449       a5 = _mm256_loadu_pd(v + 116);
1450       z2 = _mm256_fmadd_pd(a5, w1, z2);
1451 
1452       /* eleventh column of a */
1453       w2 = _mm256_set1_pd(work[10]);
1454       a0 = _mm256_loadu_pd(v + 120);
1455       z0 = _mm256_fmadd_pd(a0, w2, z0);
1456       a1 = _mm256_loadu_pd(v + 124);
1457       z1 = _mm256_fmadd_pd(a1, w2, z1);
1458       a2 = _mm256_loadu_pd(v + 128);
1459       z2 = _mm256_fmadd_pd(a2, w2, z2);
1460 
1461       /* twelveth column of a */
1462       w3 = _mm256_set1_pd(work[11]);
1463       a3 = _mm256_loadu_pd(v + 132);
1464       z0 = _mm256_fmadd_pd(a3, w3, z0);
1465       a4 = _mm256_loadu_pd(v + 136);
1466       z1 = _mm256_fmadd_pd(a4, w3, z1);
1467       a5 = _mm256_loadu_pd(v + 140);
1468       z2 = _mm256_fmadd_pd(a5, w3, z2);
1469 
1470       v += bs2;
1471     }
1472     if (usecprow) z = zarray + bs * ridx[i];
1473     _mm256_storeu_pd(&z[0], z0);
1474     _mm256_storeu_pd(&z[4], z1);
1475     _mm256_storeu_pd(&z[8], z2);
1476     if (!usecprow) z += bs;
1477   }
1478   PetscCall(VecRestoreArrayRead(xx, &x));
1479   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1480   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1481   PetscFunctionReturn(PETSC_SUCCESS);
1482 }
1483 #endif
1484 
1485 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1486 /* Default MatMult for block size 15 */
MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)1487 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1488 {
1489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1490   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1491   const PetscScalar *x, *xb;
1492   PetscScalar       *zarray, xv;
1493   const MatScalar   *v;
1494   const PetscInt    *ii, *ij = a->j, *idx;
1495   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1496   PetscBool          usecprow = a->compressedrow.use;
1497 
1498   PetscFunctionBegin;
1499   PetscCall(VecGetArrayRead(xx, &x));
1500   PetscCall(VecGetArrayWrite(zz, &zarray));
1501 
1502   v = a->a;
1503   if (usecprow) {
1504     mbs  = a->compressedrow.nrows;
1505     ii   = a->compressedrow.i;
1506     ridx = a->compressedrow.rindex;
1507     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1508   } else {
1509     mbs = a->mbs;
1510     ii  = a->i;
1511     z   = zarray;
1512   }
1513 
1514   for (i = 0; i < mbs; i++) {
1515     n     = ii[i + 1] - ii[i];
1516     idx   = ij + ii[i];
1517     sum1  = 0.0;
1518     sum2  = 0.0;
1519     sum3  = 0.0;
1520     sum4  = 0.0;
1521     sum5  = 0.0;
1522     sum6  = 0.0;
1523     sum7  = 0.0;
1524     sum8  = 0.0;
1525     sum9  = 0.0;
1526     sum10 = 0.0;
1527     sum11 = 0.0;
1528     sum12 = 0.0;
1529     sum13 = 0.0;
1530     sum14 = 0.0;
1531     sum15 = 0.0;
1532 
1533     for (j = 0; j < n; j++) {
1534       xb = x + 15 * (idx[j]);
1535 
1536       for (k = 0; k < 15; k++) {
1537         xv = xb[k];
1538         sum1 += v[0] * xv;
1539         sum2 += v[1] * xv;
1540         sum3 += v[2] * xv;
1541         sum4 += v[3] * xv;
1542         sum5 += v[4] * xv;
1543         sum6 += v[5] * xv;
1544         sum7 += v[6] * xv;
1545         sum8 += v[7] * xv;
1546         sum9 += v[8] * xv;
1547         sum10 += v[9] * xv;
1548         sum11 += v[10] * xv;
1549         sum12 += v[11] * xv;
1550         sum13 += v[12] * xv;
1551         sum14 += v[13] * xv;
1552         sum15 += v[14] * xv;
1553         v += 15;
1554       }
1555     }
1556     if (usecprow) z = zarray + 15 * ridx[i];
1557     z[0]  = sum1;
1558     z[1]  = sum2;
1559     z[2]  = sum3;
1560     z[3]  = sum4;
1561     z[4]  = sum5;
1562     z[5]  = sum6;
1563     z[6]  = sum7;
1564     z[7]  = sum8;
1565     z[8]  = sum9;
1566     z[9]  = sum10;
1567     z[10] = sum11;
1568     z[11] = sum12;
1569     z[12] = sum13;
1570     z[13] = sum14;
1571     z[14] = sum15;
1572 
1573     if (!usecprow) z += 15;
1574   }
1575 
1576   PetscCall(VecRestoreArrayRead(xx, &x));
1577   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1578   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1579   PetscFunctionReturn(PETSC_SUCCESS);
1580 }
1581 
1582 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)1583 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1584 {
1585   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1586   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1587   const PetscScalar *x, *xb;
1588   PetscScalar        x1, x2, x3, x4, *zarray;
1589   const MatScalar   *v;
1590   const PetscInt    *ii, *ij = a->j, *idx;
1591   PetscInt           mbs, i, j, n, *ridx = NULL;
1592   PetscBool          usecprow = a->compressedrow.use;
1593 
1594   PetscFunctionBegin;
1595   PetscCall(VecGetArrayRead(xx, &x));
1596   PetscCall(VecGetArrayWrite(zz, &zarray));
1597 
1598   v = a->a;
1599   if (usecprow) {
1600     mbs  = a->compressedrow.nrows;
1601     ii   = a->compressedrow.i;
1602     ridx = a->compressedrow.rindex;
1603     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1604   } else {
1605     mbs = a->mbs;
1606     ii  = a->i;
1607     z   = zarray;
1608   }
1609 
1610   for (i = 0; i < mbs; i++) {
1611     n     = ii[i + 1] - ii[i];
1612     idx   = ij + ii[i];
1613     sum1  = 0.0;
1614     sum2  = 0.0;
1615     sum3  = 0.0;
1616     sum4  = 0.0;
1617     sum5  = 0.0;
1618     sum6  = 0.0;
1619     sum7  = 0.0;
1620     sum8  = 0.0;
1621     sum9  = 0.0;
1622     sum10 = 0.0;
1623     sum11 = 0.0;
1624     sum12 = 0.0;
1625     sum13 = 0.0;
1626     sum14 = 0.0;
1627     sum15 = 0.0;
1628 
1629     for (j = 0; j < n; j++) {
1630       xb = x + 15 * (idx[j]);
1631       x1 = xb[0];
1632       x2 = xb[1];
1633       x3 = xb[2];
1634       x4 = xb[3];
1635 
1636       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1637       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1638       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1639       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1640       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1641       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1642       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1643       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1644       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1645       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1646       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1647       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1648       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1649       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1650       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1651 
1652       v += 60;
1653 
1654       x1 = xb[4];
1655       x2 = xb[5];
1656       x3 = xb[6];
1657       x4 = xb[7];
1658 
1659       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1660       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1661       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1662       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1663       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1664       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1665       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1666       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1667       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1668       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1669       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1670       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1671       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1672       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1673       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1674       v += 60;
1675 
1676       x1 = xb[8];
1677       x2 = xb[9];
1678       x3 = xb[10];
1679       x4 = xb[11];
1680       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1681       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1682       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1683       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1684       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1685       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1686       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1687       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1688       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1689       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1690       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1691       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1692       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1693       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1694       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1695       v += 60;
1696 
1697       x1 = xb[12];
1698       x2 = xb[13];
1699       x3 = xb[14];
1700       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1701       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1702       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1703       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1704       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1705       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1706       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1707       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1708       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1709       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1710       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1711       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1712       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1713       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1714       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1715       v += 45;
1716     }
1717     if (usecprow) z = zarray + 15 * ridx[i];
1718     z[0]  = sum1;
1719     z[1]  = sum2;
1720     z[2]  = sum3;
1721     z[3]  = sum4;
1722     z[4]  = sum5;
1723     z[5]  = sum6;
1724     z[6]  = sum7;
1725     z[7]  = sum8;
1726     z[8]  = sum9;
1727     z[9]  = sum10;
1728     z[10] = sum11;
1729     z[11] = sum12;
1730     z[12] = sum13;
1731     z[13] = sum14;
1732     z[14] = sum15;
1733 
1734     if (!usecprow) z += 15;
1735   }
1736 
1737   PetscCall(VecRestoreArrayRead(xx, &x));
1738   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1739   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1740   PetscFunctionReturn(PETSC_SUCCESS);
1741 }
1742 
1743 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)1744 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1745 {
1746   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1747   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1748   const PetscScalar *x, *xb;
1749   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1750   const MatScalar   *v;
1751   const PetscInt    *ii, *ij = a->j, *idx;
1752   PetscInt           mbs, i, j, n, *ridx = NULL;
1753   PetscBool          usecprow = a->compressedrow.use;
1754 
1755   PetscFunctionBegin;
1756   PetscCall(VecGetArrayRead(xx, &x));
1757   PetscCall(VecGetArrayWrite(zz, &zarray));
1758 
1759   v = a->a;
1760   if (usecprow) {
1761     mbs  = a->compressedrow.nrows;
1762     ii   = a->compressedrow.i;
1763     ridx = a->compressedrow.rindex;
1764     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1765   } else {
1766     mbs = a->mbs;
1767     ii  = a->i;
1768     z   = zarray;
1769   }
1770 
1771   for (i = 0; i < mbs; i++) {
1772     n     = ii[i + 1] - ii[i];
1773     idx   = ij + ii[i];
1774     sum1  = 0.0;
1775     sum2  = 0.0;
1776     sum3  = 0.0;
1777     sum4  = 0.0;
1778     sum5  = 0.0;
1779     sum6  = 0.0;
1780     sum7  = 0.0;
1781     sum8  = 0.0;
1782     sum9  = 0.0;
1783     sum10 = 0.0;
1784     sum11 = 0.0;
1785     sum12 = 0.0;
1786     sum13 = 0.0;
1787     sum14 = 0.0;
1788     sum15 = 0.0;
1789 
1790     for (j = 0; j < n; j++) {
1791       xb = x + 15 * (idx[j]);
1792       x1 = xb[0];
1793       x2 = xb[1];
1794       x3 = xb[2];
1795       x4 = xb[3];
1796       x5 = xb[4];
1797       x6 = xb[5];
1798       x7 = xb[6];
1799       x8 = xb[7];
1800 
1801       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1802       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1803       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1804       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1805       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1806       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1807       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1808       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1809       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1810       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1811       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1812       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1813       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1814       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1815       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1816       v += 120;
1817 
1818       x1 = xb[8];
1819       x2 = xb[9];
1820       x3 = xb[10];
1821       x4 = xb[11];
1822       x5 = xb[12];
1823       x6 = xb[13];
1824       x7 = xb[14];
1825 
1826       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1827       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1828       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1829       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1830       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1831       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1832       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1833       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1834       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1835       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1836       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1837       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1838       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1839       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1840       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1841       v += 105;
1842     }
1843     if (usecprow) z = zarray + 15 * ridx[i];
1844     z[0]  = sum1;
1845     z[1]  = sum2;
1846     z[2]  = sum3;
1847     z[3]  = sum4;
1848     z[4]  = sum5;
1849     z[5]  = sum6;
1850     z[6]  = sum7;
1851     z[7]  = sum8;
1852     z[8]  = sum9;
1853     z[9]  = sum10;
1854     z[10] = sum11;
1855     z[11] = sum12;
1856     z[12] = sum13;
1857     z[13] = sum14;
1858     z[14] = sum15;
1859 
1860     if (!usecprow) z += 15;
1861   }
1862 
1863   PetscCall(VecRestoreArrayRead(xx, &x));
1864   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1865   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1866   PetscFunctionReturn(PETSC_SUCCESS);
1867 }
1868 
1869 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)1870 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1871 {
1872   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1873   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1874   const PetscScalar *x, *xb;
1875   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1876   const MatScalar   *v;
1877   const PetscInt    *ii, *ij = a->j, *idx;
1878   PetscInt           mbs, i, j, n, *ridx = NULL;
1879   PetscBool          usecprow = a->compressedrow.use;
1880 
1881   PetscFunctionBegin;
1882   PetscCall(VecGetArrayRead(xx, &x));
1883   PetscCall(VecGetArrayWrite(zz, &zarray));
1884 
1885   v = a->a;
1886   if (usecprow) {
1887     mbs  = a->compressedrow.nrows;
1888     ii   = a->compressedrow.i;
1889     ridx = a->compressedrow.rindex;
1890     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1891   } else {
1892     mbs = a->mbs;
1893     ii  = a->i;
1894     z   = zarray;
1895   }
1896 
1897   for (i = 0; i < mbs; i++) {
1898     n     = ii[i + 1] - ii[i];
1899     idx   = ij + ii[i];
1900     sum1  = 0.0;
1901     sum2  = 0.0;
1902     sum3  = 0.0;
1903     sum4  = 0.0;
1904     sum5  = 0.0;
1905     sum6  = 0.0;
1906     sum7  = 0.0;
1907     sum8  = 0.0;
1908     sum9  = 0.0;
1909     sum10 = 0.0;
1910     sum11 = 0.0;
1911     sum12 = 0.0;
1912     sum13 = 0.0;
1913     sum14 = 0.0;
1914     sum15 = 0.0;
1915 
1916     for (j = 0; j < n; j++) {
1917       xb  = x + 15 * (idx[j]);
1918       x1  = xb[0];
1919       x2  = xb[1];
1920       x3  = xb[2];
1921       x4  = xb[3];
1922       x5  = xb[4];
1923       x6  = xb[5];
1924       x7  = xb[6];
1925       x8  = xb[7];
1926       x9  = xb[8];
1927       x10 = xb[9];
1928       x11 = xb[10];
1929       x12 = xb[11];
1930       x13 = xb[12];
1931       x14 = xb[13];
1932       x15 = xb[14];
1933 
1934       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1935       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1936       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1937       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1938       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1939       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1940       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1941       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1942       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1943       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1944       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1945       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1946       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1947       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1948       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1949       v += 225;
1950     }
1951     if (usecprow) z = zarray + 15 * ridx[i];
1952     z[0]  = sum1;
1953     z[1]  = sum2;
1954     z[2]  = sum3;
1955     z[3]  = sum4;
1956     z[4]  = sum5;
1957     z[5]  = sum6;
1958     z[6]  = sum7;
1959     z[7]  = sum8;
1960     z[8]  = sum9;
1961     z[9]  = sum10;
1962     z[10] = sum11;
1963     z[11] = sum12;
1964     z[12] = sum13;
1965     z[13] = sum14;
1966     z[14] = sum15;
1967 
1968     if (!usecprow) z += 15;
1969   }
1970 
1971   PetscCall(VecRestoreArrayRead(xx, &x));
1972   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1973   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1974   PetscFunctionReturn(PETSC_SUCCESS);
1975 }
1976 
1977 /*
1978     This will not work with MatScalar == float because it calls the BLAS
1979 */
MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)1980 PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1981 {
1982   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1983   PetscScalar       *z = NULL, *work, *workt, *zarray;
1984   const PetscScalar *x, *xb;
1985   const MatScalar   *v;
1986   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1987   const PetscInt    *idx, *ii, *ridx = NULL;
1988   PetscInt           ncols, k;
1989   PetscBool          usecprow = a->compressedrow.use;
1990 
1991   PetscFunctionBegin;
1992   PetscCall(VecGetArrayRead(xx, &x));
1993   PetscCall(VecGetArrayWrite(zz, &zarray));
1994 
1995   idx = a->j;
1996   v   = a->a;
1997   if (usecprow) {
1998     mbs  = a->compressedrow.nrows;
1999     ii   = a->compressedrow.i;
2000     ridx = a->compressedrow.rindex;
2001     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2002   } else {
2003     mbs = a->mbs;
2004     ii  = a->i;
2005     z   = zarray;
2006   }
2007 
2008   if (!a->mult_work) {
2009     k = PetscMax(A->rmap->n, A->cmap->n);
2010     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2011   }
2012   work = a->mult_work;
2013   for (i = 0; i < mbs; i++) {
2014     n = ii[1] - ii[0];
2015     ii++;
2016     ncols = n * bs;
2017     workt = work;
2018     for (j = 0; j < n; j++) {
2019       xb = x + bs * (*idx++);
2020       for (k = 0; k < bs; k++) workt[k] = xb[k];
2021       workt += bs;
2022     }
2023     if (usecprow) z = zarray + bs * ridx[i];
2024     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2025     v += n * bs2;
2026     if (!usecprow) z += bs;
2027   }
2028   PetscCall(VecRestoreArrayRead(xx, &x));
2029   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2030   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2031   PetscFunctionReturn(PETSC_SUCCESS);
2032 }
2033 
MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)2034 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2035 {
2036   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2037   const PetscScalar *x;
2038   PetscScalar       *y, *z, sum;
2039   const MatScalar   *v;
2040   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2041   const PetscInt    *idx, *ii;
2042   PetscBool          usecprow = a->compressedrow.use;
2043 
2044   PetscFunctionBegin;
2045   PetscCall(VecGetArrayRead(xx, &x));
2046   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2047 
2048   idx = a->j;
2049   v   = a->a;
2050   if (usecprow) {
2051     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2052     mbs  = a->compressedrow.nrows;
2053     ii   = a->compressedrow.i;
2054     ridx = a->compressedrow.rindex;
2055   } else {
2056     ii = a->i;
2057   }
2058 
2059   for (i = 0; i < mbs; i++) {
2060     n = ii[1] - ii[0];
2061     ii++;
2062     if (!usecprow) {
2063       sum = y[i];
2064     } else {
2065       sum = y[ridx[i]];
2066     }
2067     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2068     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2069     PetscSparseDensePlusDot(sum, x, v, idx, n);
2070     v += n;
2071     idx += n;
2072     if (usecprow) {
2073       z[ridx[i]] = sum;
2074     } else {
2075       z[i] = sum;
2076     }
2077   }
2078   PetscCall(VecRestoreArrayRead(xx, &x));
2079   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2080   PetscCall(PetscLogFlops(2.0 * a->nz));
2081   PetscFunctionReturn(PETSC_SUCCESS);
2082 }
2083 
MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)2084 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2085 {
2086   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2087   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2088   const PetscScalar *x, *xb;
2089   PetscScalar        x1, x2, *yarray, *zarray;
2090   const MatScalar   *v;
2091   PetscInt           mbs = a->mbs, i, n, j;
2092   const PetscInt    *idx, *ii, *ridx = NULL;
2093   PetscBool          usecprow = a->compressedrow.use;
2094 
2095   PetscFunctionBegin;
2096   PetscCall(VecGetArrayRead(xx, &x));
2097   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2098 
2099   idx = a->j;
2100   v   = a->a;
2101   if (usecprow) {
2102     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2103     mbs  = a->compressedrow.nrows;
2104     ii   = a->compressedrow.i;
2105     ridx = a->compressedrow.rindex;
2106   } else {
2107     ii = a->i;
2108     y  = yarray;
2109     z  = zarray;
2110   }
2111 
2112   for (i = 0; i < mbs; i++) {
2113     n = ii[1] - ii[0];
2114     ii++;
2115     if (usecprow) {
2116       z = zarray + 2 * ridx[i];
2117       y = yarray + 2 * ridx[i];
2118     }
2119     sum1 = y[0];
2120     sum2 = y[1];
2121     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2122     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2123     for (j = 0; j < n; j++) {
2124       xb = x + 2 * (*idx++);
2125       x1 = xb[0];
2126       x2 = xb[1];
2127 
2128       sum1 += v[0] * x1 + v[2] * x2;
2129       sum2 += v[1] * x1 + v[3] * x2;
2130       v += 4;
2131     }
2132     z[0] = sum1;
2133     z[1] = sum2;
2134     if (!usecprow) {
2135       z += 2;
2136       y += 2;
2137     }
2138   }
2139   PetscCall(VecRestoreArrayRead(xx, &x));
2140   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2141   PetscCall(PetscLogFlops(4.0 * a->nz));
2142   PetscFunctionReturn(PETSC_SUCCESS);
2143 }
2144 
MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)2145 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2146 {
2147   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2148   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2149   const PetscScalar *x, *xb;
2150   const MatScalar   *v;
2151   PetscInt           mbs = a->mbs, i, j, n;
2152   const PetscInt    *idx, *ii, *ridx = NULL;
2153   PetscBool          usecprow = a->compressedrow.use;
2154 
2155   PetscFunctionBegin;
2156   PetscCall(VecGetArrayRead(xx, &x));
2157   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2158 
2159   idx = a->j;
2160   v   = a->a;
2161   if (usecprow) {
2162     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2163     mbs  = a->compressedrow.nrows;
2164     ii   = a->compressedrow.i;
2165     ridx = a->compressedrow.rindex;
2166   } else {
2167     ii = a->i;
2168     y  = yarray;
2169     z  = zarray;
2170   }
2171 
2172   for (i = 0; i < mbs; i++) {
2173     n = ii[1] - ii[0];
2174     ii++;
2175     if (usecprow) {
2176       z = zarray + 3 * ridx[i];
2177       y = yarray + 3 * ridx[i];
2178     }
2179     sum1 = y[0];
2180     sum2 = y[1];
2181     sum3 = y[2];
2182     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2183     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2184     for (j = 0; j < n; j++) {
2185       xb = x + 3 * (*idx++);
2186       x1 = xb[0];
2187       x2 = xb[1];
2188       x3 = xb[2];
2189       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2190       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2191       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2192       v += 9;
2193     }
2194     z[0] = sum1;
2195     z[1] = sum2;
2196     z[2] = sum3;
2197     if (!usecprow) {
2198       z += 3;
2199       y += 3;
2200     }
2201   }
2202   PetscCall(VecRestoreArrayRead(xx, &x));
2203   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2204   PetscCall(PetscLogFlops(18.0 * a->nz));
2205   PetscFunctionReturn(PETSC_SUCCESS);
2206 }
2207 
MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)2208 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2209 {
2210   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2211   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2212   const PetscScalar *x, *xb;
2213   const MatScalar   *v;
2214   PetscInt           mbs = a->mbs, i, j, n;
2215   const PetscInt    *idx, *ii, *ridx = NULL;
2216   PetscBool          usecprow = a->compressedrow.use;
2217 
2218   PetscFunctionBegin;
2219   PetscCall(VecGetArrayRead(xx, &x));
2220   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2221 
2222   idx = a->j;
2223   v   = a->a;
2224   if (usecprow) {
2225     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2226     mbs  = a->compressedrow.nrows;
2227     ii   = a->compressedrow.i;
2228     ridx = a->compressedrow.rindex;
2229   } else {
2230     ii = a->i;
2231     y  = yarray;
2232     z  = zarray;
2233   }
2234 
2235   for (i = 0; i < mbs; i++) {
2236     n = ii[1] - ii[0];
2237     ii++;
2238     if (usecprow) {
2239       z = zarray + 4 * ridx[i];
2240       y = yarray + 4 * ridx[i];
2241     }
2242     sum1 = y[0];
2243     sum2 = y[1];
2244     sum3 = y[2];
2245     sum4 = y[3];
2246     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2247     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2248     for (j = 0; j < n; j++) {
2249       xb = x + 4 * (*idx++);
2250       x1 = xb[0];
2251       x2 = xb[1];
2252       x3 = xb[2];
2253       x4 = xb[3];
2254       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2255       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2256       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2257       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2258       v += 16;
2259     }
2260     z[0] = sum1;
2261     z[1] = sum2;
2262     z[2] = sum3;
2263     z[3] = sum4;
2264     if (!usecprow) {
2265       z += 4;
2266       y += 4;
2267     }
2268   }
2269   PetscCall(VecRestoreArrayRead(xx, &x));
2270   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2271   PetscCall(PetscLogFlops(32.0 * a->nz));
2272   PetscFunctionReturn(PETSC_SUCCESS);
2273 }
2274 
MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)2275 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2276 {
2277   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2278   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2279   const PetscScalar *x, *xb;
2280   PetscScalar       *yarray, *zarray;
2281   const MatScalar   *v;
2282   PetscInt           mbs = a->mbs, i, j, n;
2283   const PetscInt    *idx, *ii, *ridx = NULL;
2284   PetscBool          usecprow = a->compressedrow.use;
2285 
2286   PetscFunctionBegin;
2287   PetscCall(VecGetArrayRead(xx, &x));
2288   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2289 
2290   idx = a->j;
2291   v   = a->a;
2292   if (usecprow) {
2293     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2294     mbs  = a->compressedrow.nrows;
2295     ii   = a->compressedrow.i;
2296     ridx = a->compressedrow.rindex;
2297   } else {
2298     ii = a->i;
2299     y  = yarray;
2300     z  = zarray;
2301   }
2302 
2303   for (i = 0; i < mbs; i++) {
2304     n = ii[1] - ii[0];
2305     ii++;
2306     if (usecprow) {
2307       z = zarray + 5 * ridx[i];
2308       y = yarray + 5 * ridx[i];
2309     }
2310     sum1 = y[0];
2311     sum2 = y[1];
2312     sum3 = y[2];
2313     sum4 = y[3];
2314     sum5 = y[4];
2315     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2316     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2317     for (j = 0; j < n; j++) {
2318       xb = x + 5 * (*idx++);
2319       x1 = xb[0];
2320       x2 = xb[1];
2321       x3 = xb[2];
2322       x4 = xb[3];
2323       x5 = xb[4];
2324       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2325       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2326       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2327       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2328       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2329       v += 25;
2330     }
2331     z[0] = sum1;
2332     z[1] = sum2;
2333     z[2] = sum3;
2334     z[3] = sum4;
2335     z[4] = sum5;
2336     if (!usecprow) {
2337       z += 5;
2338       y += 5;
2339     }
2340   }
2341   PetscCall(VecRestoreArrayRead(xx, &x));
2342   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2343   PetscCall(PetscLogFlops(50.0 * a->nz));
2344   PetscFunctionReturn(PETSC_SUCCESS);
2345 }
2346 
MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)2347 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2348 {
2349   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2350   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2351   const PetscScalar *x, *xb;
2352   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2353   const MatScalar   *v;
2354   PetscInt           mbs = a->mbs, i, j, n;
2355   const PetscInt    *idx, *ii, *ridx = NULL;
2356   PetscBool          usecprow = a->compressedrow.use;
2357 
2358   PetscFunctionBegin;
2359   PetscCall(VecGetArrayRead(xx, &x));
2360   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2361 
2362   idx = a->j;
2363   v   = a->a;
2364   if (usecprow) {
2365     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2366     mbs  = a->compressedrow.nrows;
2367     ii   = a->compressedrow.i;
2368     ridx = a->compressedrow.rindex;
2369   } else {
2370     ii = a->i;
2371     y  = yarray;
2372     z  = zarray;
2373   }
2374 
2375   for (i = 0; i < mbs; i++) {
2376     n = ii[1] - ii[0];
2377     ii++;
2378     if (usecprow) {
2379       z = zarray + 6 * ridx[i];
2380       y = yarray + 6 * ridx[i];
2381     }
2382     sum1 = y[0];
2383     sum2 = y[1];
2384     sum3 = y[2];
2385     sum4 = y[3];
2386     sum5 = y[4];
2387     sum6 = y[5];
2388     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2389     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2390     for (j = 0; j < n; j++) {
2391       xb = x + 6 * (*idx++);
2392       x1 = xb[0];
2393       x2 = xb[1];
2394       x3 = xb[2];
2395       x4 = xb[3];
2396       x5 = xb[4];
2397       x6 = xb[5];
2398       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2399       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2400       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2401       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2402       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2403       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2404       v += 36;
2405     }
2406     z[0] = sum1;
2407     z[1] = sum2;
2408     z[2] = sum3;
2409     z[3] = sum4;
2410     z[4] = sum5;
2411     z[5] = sum6;
2412     if (!usecprow) {
2413       z += 6;
2414       y += 6;
2415     }
2416   }
2417   PetscCall(VecRestoreArrayRead(xx, &x));
2418   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2419   PetscCall(PetscLogFlops(72.0 * a->nz));
2420   PetscFunctionReturn(PETSC_SUCCESS);
2421 }
2422 
MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)2423 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2424 {
2425   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2426   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2427   const PetscScalar *x, *xb;
2428   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2429   const MatScalar   *v;
2430   PetscInt           mbs = a->mbs, i, j, n;
2431   const PetscInt    *idx, *ii, *ridx = NULL;
2432   PetscBool          usecprow = a->compressedrow.use;
2433 
2434   PetscFunctionBegin;
2435   PetscCall(VecGetArrayRead(xx, &x));
2436   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2437 
2438   idx = a->j;
2439   v   = a->a;
2440   if (usecprow) {
2441     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2442     mbs  = a->compressedrow.nrows;
2443     ii   = a->compressedrow.i;
2444     ridx = a->compressedrow.rindex;
2445   } else {
2446     ii = a->i;
2447     y  = yarray;
2448     z  = zarray;
2449   }
2450 
2451   for (i = 0; i < mbs; i++) {
2452     n = ii[1] - ii[0];
2453     ii++;
2454     if (usecprow) {
2455       z = zarray + 7 * ridx[i];
2456       y = yarray + 7 * ridx[i];
2457     }
2458     sum1 = y[0];
2459     sum2 = y[1];
2460     sum3 = y[2];
2461     sum4 = y[3];
2462     sum5 = y[4];
2463     sum6 = y[5];
2464     sum7 = y[6];
2465     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2466     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2467     for (j = 0; j < n; j++) {
2468       xb = x + 7 * (*idx++);
2469       x1 = xb[0];
2470       x2 = xb[1];
2471       x3 = xb[2];
2472       x4 = xb[3];
2473       x5 = xb[4];
2474       x6 = xb[5];
2475       x7 = xb[6];
2476       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2477       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2478       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2479       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2480       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2481       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2482       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2483       v += 49;
2484     }
2485     z[0] = sum1;
2486     z[1] = sum2;
2487     z[2] = sum3;
2488     z[3] = sum4;
2489     z[4] = sum5;
2490     z[5] = sum6;
2491     z[6] = sum7;
2492     if (!usecprow) {
2493       z += 7;
2494       y += 7;
2495     }
2496   }
2497   PetscCall(VecRestoreArrayRead(xx, &x));
2498   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2499   PetscCall(PetscLogFlops(98.0 * a->nz));
2500   PetscFunctionReturn(PETSC_SUCCESS);
2501 }
2502 
2503 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)2504 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2505 {
2506   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2507   PetscScalar       *z = NULL, *work, *workt, *zarray;
2508   const PetscScalar *x, *xb;
2509   const MatScalar   *v;
2510   PetscInt           mbs, i, j, n;
2511   PetscInt           k;
2512   PetscBool          usecprow = a->compressedrow.use;
2513   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2514 
2515   __m256d a0, a1, a2, a3, a4, a5;
2516   __m256d w0, w1, w2, w3;
2517   __m256d z0, z1, z2;
2518   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2519 
2520   PetscFunctionBegin;
2521   PetscCall(VecCopy(yy, zz));
2522   PetscCall(VecGetArrayRead(xx, &x));
2523   PetscCall(VecGetArray(zz, &zarray));
2524 
2525   idx = a->j;
2526   v   = a->a;
2527   if (usecprow) {
2528     mbs  = a->compressedrow.nrows;
2529     ii   = a->compressedrow.i;
2530     ridx = a->compressedrow.rindex;
2531   } else {
2532     mbs = a->mbs;
2533     ii  = a->i;
2534     z   = zarray;
2535   }
2536 
2537   if (!a->mult_work) {
2538     k = PetscMax(A->rmap->n, A->cmap->n);
2539     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2540   }
2541 
2542   work = a->mult_work;
2543   for (i = 0; i < mbs; i++) {
2544     n = ii[1] - ii[0];
2545     ii++;
2546     workt = work;
2547     for (j = 0; j < n; j++) {
2548       xb = x + bs * (*idx++);
2549       for (k = 0; k < bs; k++) workt[k] = xb[k];
2550       workt += bs;
2551     }
2552     if (usecprow) z = zarray + bs * ridx[i];
2553 
2554     z0 = _mm256_loadu_pd(&z[0]);
2555     z1 = _mm256_loadu_pd(&z[4]);
2556     z2 = _mm256_set1_pd(z[8]);
2557 
2558     for (j = 0; j < n; j++) {
2559       /* first column of a */
2560       w0 = _mm256_set1_pd(work[j * 9]);
2561       a0 = _mm256_loadu_pd(&v[j * 81]);
2562       z0 = _mm256_fmadd_pd(a0, w0, z0);
2563       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2564       z1 = _mm256_fmadd_pd(a1, w0, z1);
2565       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2566       z2 = _mm256_fmadd_pd(a2, w0, z2);
2567 
2568       /* second column of a */
2569       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2570       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2571       z0 = _mm256_fmadd_pd(a0, w1, z0);
2572       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2573       z1 = _mm256_fmadd_pd(a1, w1, z1);
2574       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2575       z2 = _mm256_fmadd_pd(a2, w1, z2);
2576 
2577       /* third column of a */
2578       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2579       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2580       z0 = _mm256_fmadd_pd(a3, w2, z0);
2581       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2582       z1 = _mm256_fmadd_pd(a4, w2, z1);
2583       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2584       z2 = _mm256_fmadd_pd(a5, w2, z2);
2585 
2586       /* fourth column of a */
2587       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2588       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2589       z0 = _mm256_fmadd_pd(a0, w3, z0);
2590       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2591       z1 = _mm256_fmadd_pd(a1, w3, z1);
2592       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2593       z2 = _mm256_fmadd_pd(a2, w3, z2);
2594 
2595       /* fifth column of a */
2596       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2597       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2598       z0 = _mm256_fmadd_pd(a3, w0, z0);
2599       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2600       z1 = _mm256_fmadd_pd(a4, w0, z1);
2601       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2602       z2 = _mm256_fmadd_pd(a5, w0, z2);
2603 
2604       /* sixth column of a */
2605       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2606       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2607       z0 = _mm256_fmadd_pd(a0, w1, z0);
2608       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2609       z1 = _mm256_fmadd_pd(a1, w1, z1);
2610       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2611       z2 = _mm256_fmadd_pd(a2, w1, z2);
2612 
2613       /* seventh column of a */
2614       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2615       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2616       z0 = _mm256_fmadd_pd(a0, w2, z0);
2617       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2618       z1 = _mm256_fmadd_pd(a1, w2, z1);
2619       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2620       z2 = _mm256_fmadd_pd(a2, w2, z2);
2621 
2622       /* eighth column of a */
2623       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2624       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2625       z0 = _mm256_fmadd_pd(a3, w3, z0);
2626       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2627       z1 = _mm256_fmadd_pd(a4, w3, z1);
2628       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2629       z2 = _mm256_fmadd_pd(a5, w3, z2);
2630 
2631       /* ninth column of a */
2632       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2633       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2634       z0 = _mm256_fmadd_pd(a0, w0, z0);
2635       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2636       z1 = _mm256_fmadd_pd(a1, w0, z1);
2637       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2638       z2 = _mm256_fmadd_pd(a2, w0, z2);
2639     }
2640 
2641     _mm256_storeu_pd(&z[0], z0);
2642     _mm256_storeu_pd(&z[4], z1);
2643     _mm256_maskstore_pd(&z[8], mask1, z2);
2644 
2645     v += n * bs2;
2646     if (!usecprow) z += bs;
2647   }
2648   PetscCall(VecRestoreArrayRead(xx, &x));
2649   PetscCall(VecRestoreArray(zz, &zarray));
2650   PetscCall(PetscLogFlops(162.0 * a->nz));
2651   PetscFunctionReturn(PETSC_SUCCESS);
2652 }
2653 #endif
2654 
MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)2655 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2656 {
2657   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2658   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2659   const PetscScalar *x, *xb;
2660   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2661   const MatScalar   *v;
2662   PetscInt           mbs = a->mbs, i, j, n;
2663   const PetscInt    *idx, *ii, *ridx = NULL;
2664   PetscBool          usecprow = a->compressedrow.use;
2665 
2666   PetscFunctionBegin;
2667   PetscCall(VecGetArrayRead(xx, &x));
2668   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2669 
2670   idx = a->j;
2671   v   = a->a;
2672   if (usecprow) {
2673     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2674     mbs  = a->compressedrow.nrows;
2675     ii   = a->compressedrow.i;
2676     ridx = a->compressedrow.rindex;
2677   } else {
2678     ii = a->i;
2679     y  = yarray;
2680     z  = zarray;
2681   }
2682 
2683   for (i = 0; i < mbs; i++) {
2684     n = ii[1] - ii[0];
2685     ii++;
2686     if (usecprow) {
2687       z = zarray + 11 * ridx[i];
2688       y = yarray + 11 * ridx[i];
2689     }
2690     sum1  = y[0];
2691     sum2  = y[1];
2692     sum3  = y[2];
2693     sum4  = y[3];
2694     sum5  = y[4];
2695     sum6  = y[5];
2696     sum7  = y[6];
2697     sum8  = y[7];
2698     sum9  = y[8];
2699     sum10 = y[9];
2700     sum11 = y[10];
2701     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2702     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2703     for (j = 0; j < n; j++) {
2704       xb  = x + 11 * (*idx++);
2705       x1  = xb[0];
2706       x2  = xb[1];
2707       x3  = xb[2];
2708       x4  = xb[3];
2709       x5  = xb[4];
2710       x6  = xb[5];
2711       x7  = xb[6];
2712       x8  = xb[7];
2713       x9  = xb[8];
2714       x10 = xb[9];
2715       x11 = xb[10];
2716       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2717       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2718       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2719       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2720       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2721       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2722       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2723       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2724       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2725       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2726       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2727       v += 121;
2728     }
2729     z[0]  = sum1;
2730     z[1]  = sum2;
2731     z[2]  = sum3;
2732     z[3]  = sum4;
2733     z[4]  = sum5;
2734     z[5]  = sum6;
2735     z[6]  = sum7;
2736     z[7]  = sum8;
2737     z[8]  = sum9;
2738     z[9]  = sum10;
2739     z[10] = sum11;
2740     if (!usecprow) {
2741       z += 11;
2742       y += 11;
2743     }
2744   }
2745   PetscCall(VecRestoreArrayRead(xx, &x));
2746   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2747   PetscCall(PetscLogFlops(242.0 * a->nz));
2748   PetscFunctionReturn(PETSC_SUCCESS);
2749 }
2750 
MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)2751 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2752 {
2753   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2754   PetscScalar       *z = NULL, *work, *workt, *zarray;
2755   const PetscScalar *x, *xb;
2756   const MatScalar   *v;
2757   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2758   PetscInt           ncols, k;
2759   const PetscInt    *ridx     = NULL, *idx, *ii;
2760   PetscBool          usecprow = a->compressedrow.use;
2761 
2762   PetscFunctionBegin;
2763   PetscCall(VecCopy(yy, zz));
2764   PetscCall(VecGetArrayRead(xx, &x));
2765   PetscCall(VecGetArray(zz, &zarray));
2766 
2767   idx = a->j;
2768   v   = a->a;
2769   if (usecprow) {
2770     mbs  = a->compressedrow.nrows;
2771     ii   = a->compressedrow.i;
2772     ridx = a->compressedrow.rindex;
2773   } else {
2774     mbs = a->mbs;
2775     ii  = a->i;
2776     z   = zarray;
2777   }
2778 
2779   if (!a->mult_work) {
2780     k = PetscMax(A->rmap->n, A->cmap->n);
2781     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2782   }
2783   work = a->mult_work;
2784   for (i = 0; i < mbs; i++) {
2785     n = ii[1] - ii[0];
2786     ii++;
2787     ncols = n * bs;
2788     workt = work;
2789     for (j = 0; j < n; j++) {
2790       xb = x + bs * (*idx++);
2791       for (k = 0; k < bs; k++) workt[k] = xb[k];
2792       workt += bs;
2793     }
2794     if (usecprow) z = zarray + bs * ridx[i];
2795     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2796     v += n * bs2;
2797     if (!usecprow) z += bs;
2798   }
2799   PetscCall(VecRestoreArrayRead(xx, &x));
2800   PetscCall(VecRestoreArray(zz, &zarray));
2801   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2802   PetscFunctionReturn(PETSC_SUCCESS);
2803 }
2804 
MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)2805 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2806 {
2807   PetscScalar zero = 0.0;
2808 
2809   PetscFunctionBegin;
2810   PetscCall(VecSet(zz, zero));
2811   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2812   PetscFunctionReturn(PETSC_SUCCESS);
2813 }
2814 
MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)2815 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2816 {
2817   PetscScalar zero = 0.0;
2818 
2819   PetscFunctionBegin;
2820   PetscCall(VecSet(zz, zero));
2821   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2822   PetscFunctionReturn(PETSC_SUCCESS);
2823 }
2824 
MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)2825 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2826 {
2827   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2828   PetscScalar       *z, x1, x2, x3, x4, x5;
2829   const PetscScalar *x, *xb = NULL;
2830   const MatScalar   *v;
2831   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2832   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2833   Mat_CompressedRow  cprow    = a->compressedrow;
2834   PetscBool          usecprow = cprow.use;
2835 
2836   PetscFunctionBegin;
2837   if (yy != zz) PetscCall(VecCopy(yy, zz));
2838   PetscCall(VecGetArrayRead(xx, &x));
2839   PetscCall(VecGetArray(zz, &z));
2840 
2841   idx = a->j;
2842   v   = a->a;
2843   if (usecprow) {
2844     mbs  = cprow.nrows;
2845     ii   = cprow.i;
2846     ridx = cprow.rindex;
2847   } else {
2848     mbs = a->mbs;
2849     ii  = a->i;
2850     xb  = x;
2851   }
2852 
2853   switch (bs) {
2854   case 1:
2855     for (i = 0; i < mbs; i++) {
2856       if (usecprow) xb = x + ridx[i];
2857       x1 = xb[0];
2858       ib = idx + ii[0];
2859       n  = ii[1] - ii[0];
2860       ii++;
2861       for (j = 0; j < n; j++) {
2862         rval = ib[j];
2863         z[rval] += PetscConj(*v) * x1;
2864         v++;
2865       }
2866       if (!usecprow) xb++;
2867     }
2868     break;
2869   case 2:
2870     for (i = 0; i < mbs; i++) {
2871       if (usecprow) xb = x + 2 * ridx[i];
2872       x1 = xb[0];
2873       x2 = xb[1];
2874       ib = idx + ii[0];
2875       n  = ii[1] - ii[0];
2876       ii++;
2877       for (j = 0; j < n; j++) {
2878         rval = ib[j] * 2;
2879         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2880         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2881         v += 4;
2882       }
2883       if (!usecprow) xb += 2;
2884     }
2885     break;
2886   case 3:
2887     for (i = 0; i < mbs; i++) {
2888       if (usecprow) xb = x + 3 * ridx[i];
2889       x1 = xb[0];
2890       x2 = xb[1];
2891       x3 = xb[2];
2892       ib = idx + ii[0];
2893       n  = ii[1] - ii[0];
2894       ii++;
2895       for (j = 0; j < n; j++) {
2896         rval = ib[j] * 3;
2897         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2898         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2899         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2900         v += 9;
2901       }
2902       if (!usecprow) xb += 3;
2903     }
2904     break;
2905   case 4:
2906     for (i = 0; i < mbs; i++) {
2907       if (usecprow) xb = x + 4 * ridx[i];
2908       x1 = xb[0];
2909       x2 = xb[1];
2910       x3 = xb[2];
2911       x4 = xb[3];
2912       ib = idx + ii[0];
2913       n  = ii[1] - ii[0];
2914       ii++;
2915       for (j = 0; j < n; j++) {
2916         rval = ib[j] * 4;
2917         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2918         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2919         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2920         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2921         v += 16;
2922       }
2923       if (!usecprow) xb += 4;
2924     }
2925     break;
2926   case 5:
2927     for (i = 0; i < mbs; i++) {
2928       if (usecprow) xb = x + 5 * ridx[i];
2929       x1 = xb[0];
2930       x2 = xb[1];
2931       x3 = xb[2];
2932       x4 = xb[3];
2933       x5 = xb[4];
2934       ib = idx + ii[0];
2935       n  = ii[1] - ii[0];
2936       ii++;
2937       for (j = 0; j < n; j++) {
2938         rval = ib[j] * 5;
2939         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2940         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2941         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2942         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2943         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2944         v += 25;
2945       }
2946       if (!usecprow) xb += 5;
2947     }
2948     break;
2949   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2950     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2951 #if 0
2952     {
2953       PetscInt          ncols,k,bs2=a->bs2;
2954       PetscScalar       *work,*workt,zb;
2955       const PetscScalar *xtmp;
2956       if (!a->mult_work) {
2957         k    = PetscMax(A->rmap->n,A->cmap->n);
2958         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2959       }
2960       work = a->mult_work;
2961       xtmp = x;
2962       for (i=0; i<mbs; i++) {
2963         n     = ii[1] - ii[0]; ii++;
2964         ncols = n*bs;
2965         PetscCall(PetscArrayzero(work,ncols));
2966         if (usecprow) xtmp = x + bs*ridx[i];
2967         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2968         v += n*bs2;
2969         if (!usecprow) xtmp += bs;
2970         workt = work;
2971         for (j=0; j<n; j++) {
2972           zb = z + bs*(*idx++);
2973           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2974           workt += bs;
2975         }
2976       }
2977     }
2978 #endif
2979   }
2980   PetscCall(VecRestoreArrayRead(xx, &x));
2981   PetscCall(VecRestoreArray(zz, &z));
2982   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2983   PetscFunctionReturn(PETSC_SUCCESS);
2984 }
2985 
MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)2986 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2987 {
2988   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2989   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2990   const PetscScalar *x, *xb = NULL;
2991   const MatScalar   *v;
2992   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2993   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2994   Mat_CompressedRow  cprow    = a->compressedrow;
2995   PetscBool          usecprow = cprow.use;
2996 
2997   PetscFunctionBegin;
2998   if (yy != zz) PetscCall(VecCopy(yy, zz));
2999   PetscCall(VecGetArrayRead(xx, &x));
3000   PetscCall(VecGetArray(zz, &z));
3001 
3002   idx = a->j;
3003   v   = a->a;
3004   if (usecprow) {
3005     mbs  = cprow.nrows;
3006     ii   = cprow.i;
3007     ridx = cprow.rindex;
3008   } else {
3009     mbs = a->mbs;
3010     ii  = a->i;
3011     xb  = x;
3012   }
3013 
3014   switch (bs) {
3015   case 1:
3016     for (i = 0; i < mbs; i++) {
3017       if (usecprow) xb = x + ridx[i];
3018       x1 = xb[0];
3019       ib = idx + ii[0];
3020       n  = ii[1] - ii[0];
3021       ii++;
3022       for (j = 0; j < n; j++) {
3023         rval = ib[j];
3024         z[rval] += *v * x1;
3025         v++;
3026       }
3027       if (!usecprow) xb++;
3028     }
3029     break;
3030   case 2:
3031     for (i = 0; i < mbs; i++) {
3032       if (usecprow) xb = x + 2 * ridx[i];
3033       x1 = xb[0];
3034       x2 = xb[1];
3035       ib = idx + ii[0];
3036       n  = ii[1] - ii[0];
3037       ii++;
3038       for (j = 0; j < n; j++) {
3039         rval = ib[j] * 2;
3040         z[rval++] += v[0] * x1 + v[1] * x2;
3041         z[rval++] += v[2] * x1 + v[3] * x2;
3042         v += 4;
3043       }
3044       if (!usecprow) xb += 2;
3045     }
3046     break;
3047   case 3:
3048     for (i = 0; i < mbs; i++) {
3049       if (usecprow) xb = x + 3 * ridx[i];
3050       x1 = xb[0];
3051       x2 = xb[1];
3052       x3 = xb[2];
3053       ib = idx + ii[0];
3054       n  = ii[1] - ii[0];
3055       ii++;
3056       for (j = 0; j < n; j++) {
3057         rval = ib[j] * 3;
3058         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3059         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3060         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3061         v += 9;
3062       }
3063       if (!usecprow) xb += 3;
3064     }
3065     break;
3066   case 4:
3067     for (i = 0; i < mbs; i++) {
3068       if (usecprow) xb = x + 4 * ridx[i];
3069       x1 = xb[0];
3070       x2 = xb[1];
3071       x3 = xb[2];
3072       x4 = xb[3];
3073       ib = idx + ii[0];
3074       n  = ii[1] - ii[0];
3075       ii++;
3076       for (j = 0; j < n; j++) {
3077         rval = ib[j] * 4;
3078         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3079         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3080         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3081         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3082         v += 16;
3083       }
3084       if (!usecprow) xb += 4;
3085     }
3086     break;
3087   case 5:
3088     for (i = 0; i < mbs; i++) {
3089       if (usecprow) xb = x + 5 * ridx[i];
3090       x1 = xb[0];
3091       x2 = xb[1];
3092       x3 = xb[2];
3093       x4 = xb[3];
3094       x5 = xb[4];
3095       ib = idx + ii[0];
3096       n  = ii[1] - ii[0];
3097       ii++;
3098       for (j = 0; j < n; j++) {
3099         rval = ib[j] * 5;
3100         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3101         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3102         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3103         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3104         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3105         v += 25;
3106       }
3107       if (!usecprow) xb += 5;
3108     }
3109     break;
3110   default: { /* block sizes larger than 5 by 5 are handled by BLAS */
3111     PetscInt           ncols, k;
3112     PetscScalar       *work, *workt;
3113     const PetscScalar *xtmp;
3114     if (!a->mult_work) {
3115       k = PetscMax(A->rmap->n, A->cmap->n);
3116       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3117     }
3118     work = a->mult_work;
3119     xtmp = x;
3120     for (i = 0; i < mbs; i++) {
3121       n = ii[1] - ii[0];
3122       ii++;
3123       ncols = n * bs;
3124       PetscCall(PetscArrayzero(work, ncols));
3125       if (usecprow) xtmp = x + bs * ridx[i];
3126       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3127       v += n * bs2;
3128       if (!usecprow) xtmp += bs;
3129       workt = work;
3130       for (j = 0; j < n; j++) {
3131         zb = z + bs * (*idx++);
3132         for (k = 0; k < bs; k++) zb[k] += workt[k];
3133         workt += bs;
3134       }
3135     }
3136   }
3137   }
3138   PetscCall(VecRestoreArrayRead(xx, &x));
3139   PetscCall(VecRestoreArray(zz, &z));
3140   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3141   PetscFunctionReturn(PETSC_SUCCESS);
3142 }
3143 
MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)3144 PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3145 {
3146   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3147   PetscInt     totalnz = a->bs2 * a->nz;
3148   PetscScalar  oalpha  = alpha;
3149   PetscBLASInt one     = 1, tnz;
3150 
3151   PetscFunctionBegin;
3152   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3153   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3154   PetscCall(PetscLogFlops(totalnz));
3155   PetscFunctionReturn(PETSC_SUCCESS);
3156 }
3157 
MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal * norm)3158 PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3159 {
3160   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3161   MatScalar   *v   = a->a;
3162   PetscReal    sum = 0.0;
3163   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3164 
3165   PetscFunctionBegin;
3166   if (type == NORM_FROBENIUS) {
3167 #if defined(PETSC_USE_REAL___FP16)
3168     PetscBLASInt one = 1, cnt = bs2 * nz;
3169     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3170 #else
3171     for (i = 0; i < bs2 * nz; i++) {
3172       sum += PetscRealPart(PetscConj(*v) * (*v));
3173       v++;
3174     }
3175 #endif
3176     *norm = PetscSqrtReal(sum);
3177     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3178   } else if (type == NORM_1) { /* maximum column sum */
3179     PetscReal *tmp;
3180     PetscInt  *bcol = a->j;
3181     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3182     for (i = 0; i < nz; i++) {
3183       for (j = 0; j < bs; j++) {
3184         k1 = bs * (*bcol) + j; /* column index */
3185         for (k = 0; k < bs; k++) {
3186           tmp[k1] += PetscAbsScalar(*v);
3187           v++;
3188         }
3189       }
3190       bcol++;
3191     }
3192     *norm = 0.0;
3193     for (j = 0; j < A->cmap->n; j++) {
3194       if (tmp[j] > *norm) *norm = tmp[j];
3195     }
3196     PetscCall(PetscFree(tmp));
3197     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3198   } else if (type == NORM_INFINITY) { /* maximum row sum */
3199     *norm = 0.0;
3200     for (k = 0; k < bs; k++) {
3201       for (j = 0; j < a->mbs; j++) {
3202         v   = a->a + bs2 * a->i[j] + k;
3203         sum = 0.0;
3204         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3205           for (k1 = 0; k1 < bs; k1++) {
3206             sum += PetscAbsScalar(*v);
3207             v += bs;
3208           }
3209         }
3210         if (sum > *norm) *norm = sum;
3211       }
3212     }
3213     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3214   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3215   PetscFunctionReturn(PETSC_SUCCESS);
3216 }
3217 
MatGetDiagonal_SeqBAIJ(Mat A,Vec v)3218 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3219 {
3220   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
3221   PetscInt         n;
3222   const PetscInt   bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2;
3223   PetscScalar     *x;
3224   const MatScalar *aa = a->a, *aa_j;
3225   const PetscInt  *ai = a->i, *adiag;
3226   PetscBool        diagDense;
3227 
3228   PetscFunctionBegin;
3229   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3230 
3231   PetscCall(VecGetLocalSize(v, &n));
3232   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3233   PetscCall(VecGetArrayWrite(v, &x));
3234   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
3235   if (diagDense) {
3236     for (PetscInt i = 0, row = 0; i < ambs; i++) {
3237       aa_j = aa + adiag[i] * bs2;
3238       for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
3239     }
3240   } else {
3241     for (PetscInt i = 0, row = 0; i < ambs; i++) {
3242       const PetscInt j = adiag[i];
3243 
3244       if (j != ai[i + 1]) {
3245         aa_j = aa + j * bs2;
3246         for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
3247       } else {
3248         for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0;
3249       }
3250     }
3251   }
3252   PetscCall(VecRestoreArrayWrite(v, &x));
3253   PetscFunctionReturn(PETSC_SUCCESS);
3254 }
3255 
MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)3256 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3257 {
3258   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3259   const PetscScalar *l, *r, *li, *ri;
3260   PetscScalar        x;
3261   MatScalar         *aa, *v;
3262   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3263   const PetscInt    *ai, *aj;
3264 
3265   PetscFunctionBegin;
3266   ai  = a->i;
3267   aj  = a->j;
3268   aa  = a->a;
3269   m   = A->rmap->n;
3270   n   = A->cmap->n;
3271   bs  = A->rmap->bs;
3272   mbs = a->mbs;
3273   bs2 = a->bs2;
3274   if (ll) {
3275     PetscCall(VecGetArrayRead(ll, &l));
3276     PetscCall(VecGetLocalSize(ll, &lm));
3277     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3278     for (i = 0; i < mbs; i++) { /* for each block row */
3279       M  = ai[i + 1] - ai[i];
3280       li = l + i * bs;
3281       v  = PetscSafePointerPlusOffset(aa, bs2 * ai[i]);
3282       for (j = 0; j < M; j++) { /* for each block */
3283         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3284       }
3285     }
3286     PetscCall(VecRestoreArrayRead(ll, &l));
3287     PetscCall(PetscLogFlops(a->nz));
3288   }
3289 
3290   if (rr) {
3291     PetscCall(VecGetArrayRead(rr, &r));
3292     PetscCall(VecGetLocalSize(rr, &rn));
3293     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3294     for (i = 0; i < mbs; i++) { /* for each block row */
3295       iai = ai[i];
3296       M   = ai[i + 1] - iai;
3297       v   = PetscSafePointerPlusOffset(aa, bs2 * iai);
3298       for (j = 0; j < M; j++) { /* for each block */
3299         ri = r + bs * aj[iai + j];
3300         for (k = 0; k < bs; k++) {
3301           x = ri[k];
3302           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3303           v += bs;
3304         }
3305       }
3306     }
3307     PetscCall(VecRestoreArrayRead(rr, &r));
3308     PetscCall(PetscLogFlops(a->nz));
3309   }
3310   PetscFunctionReturn(PETSC_SUCCESS);
3311 }
3312 
MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo * info)3313 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3314 {
3315   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3316 
3317   PetscFunctionBegin;
3318   info->block_size   = a->bs2;
3319   info->nz_allocated = a->bs2 * a->maxnz;
3320   info->nz_used      = a->bs2 * a->nz;
3321   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3322   info->assemblies   = A->num_ass;
3323   info->mallocs      = A->info.mallocs;
3324   info->memory       = 0; /* REVIEW ME */
3325   if (A->factortype) {
3326     info->fill_ratio_given  = A->info.fill_ratio_given;
3327     info->fill_ratio_needed = A->info.fill_ratio_needed;
3328     info->factor_mallocs    = A->info.factor_mallocs;
3329   } else {
3330     info->fill_ratio_given  = 0;
3331     info->fill_ratio_needed = 0;
3332     info->factor_mallocs    = 0;
3333   }
3334   PetscFunctionReturn(PETSC_SUCCESS);
3335 }
3336 
MatZeroEntries_SeqBAIJ(Mat A)3337 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3338 {
3339   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3340 
3341   PetscFunctionBegin;
3342   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3343   PetscFunctionReturn(PETSC_SUCCESS);
3344 }
3345 
MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)3346 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3347 {
3348   PetscFunctionBegin;
3349   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3350   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3351   PetscFunctionReturn(PETSC_SUCCESS);
3352 }
3353 
MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)3354 PetscErrorCode MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3355 {
3356   PetscFunctionBegin;
3357   PetscCall(MatTransposeMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3358   C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqBAIJ_SeqDense;
3359   PetscFunctionReturn(PETSC_SUCCESS);
3360 }
3361 
MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3362 static PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3363 {
3364   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3365   PetscScalar       *z = NULL, sum1;
3366   const PetscScalar *xb;
3367   PetscScalar        x1;
3368   const MatScalar   *v, *vv;
3369   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3370   PetscBool          usecprow = a->compressedrow.use;
3371 
3372   PetscFunctionBegin;
3373   idx = a->j;
3374   v   = a->a;
3375   if (usecprow) {
3376     mbs  = a->compressedrow.nrows;
3377     ii   = a->compressedrow.i;
3378     ridx = a->compressedrow.rindex;
3379   } else {
3380     mbs = a->mbs;
3381     ii  = a->i;
3382     z   = c;
3383   }
3384 
3385   for (i = 0; i < mbs; i++) {
3386     n = ii[1] - ii[0];
3387     ii++;
3388     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3389     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3390     if (usecprow) z = c + ridx[i];
3391     jj = idx;
3392     vv = v;
3393     for (k = 0; k < cn; k++) {
3394       idx  = jj;
3395       v    = vv;
3396       sum1 = 0.0;
3397       for (j = 0; j < n; j++) {
3398         xb = b + (*idx++);
3399         x1 = xb[0 + k * bm];
3400         sum1 += v[0] * x1;
3401         v += 1;
3402       }
3403       z[0 + k * cm] = sum1;
3404     }
3405     if (!usecprow) z += 1;
3406   }
3407   PetscFunctionReturn(PETSC_SUCCESS);
3408 }
3409 
MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3410 static PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3411 {
3412   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3413   PetscScalar       *z = NULL, sum1, sum2;
3414   const PetscScalar *xb;
3415   PetscScalar        x1, x2;
3416   const MatScalar   *v, *vv;
3417   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3418   PetscBool          usecprow = a->compressedrow.use;
3419 
3420   PetscFunctionBegin;
3421   idx = a->j;
3422   v   = a->a;
3423   if (usecprow) {
3424     mbs  = a->compressedrow.nrows;
3425     ii   = a->compressedrow.i;
3426     ridx = a->compressedrow.rindex;
3427   } else {
3428     mbs = a->mbs;
3429     ii  = a->i;
3430     z   = c;
3431   }
3432 
3433   for (i = 0; i < mbs; i++) {
3434     n = ii[1] - ii[0];
3435     ii++;
3436     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3437     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3438     if (usecprow) z = c + 2 * ridx[i];
3439     jj = idx;
3440     vv = v;
3441     for (k = 0; k < cn; k++) {
3442       idx  = jj;
3443       v    = vv;
3444       sum1 = 0.0;
3445       sum2 = 0.0;
3446       for (j = 0; j < n; j++) {
3447         xb = b + 2 * (*idx++);
3448         x1 = xb[0 + k * bm];
3449         x2 = xb[1 + k * bm];
3450         sum1 += v[0] * x1 + v[2] * x2;
3451         sum2 += v[1] * x1 + v[3] * x2;
3452         v += 4;
3453       }
3454       z[0 + k * cm] = sum1;
3455       z[1 + k * cm] = sum2;
3456     }
3457     if (!usecprow) z += 2;
3458   }
3459   PetscFunctionReturn(PETSC_SUCCESS);
3460 }
3461 
MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3462 static PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3463 {
3464   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3465   PetscScalar       *z = NULL, sum1, sum2, sum3;
3466   const PetscScalar *xb;
3467   PetscScalar        x1, x2, x3;
3468   const MatScalar   *v, *vv;
3469   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3470   PetscBool          usecprow = a->compressedrow.use;
3471 
3472   PetscFunctionBegin;
3473   idx = a->j;
3474   v   = a->a;
3475   if (usecprow) {
3476     mbs  = a->compressedrow.nrows;
3477     ii   = a->compressedrow.i;
3478     ridx = a->compressedrow.rindex;
3479   } else {
3480     mbs = a->mbs;
3481     ii  = a->i;
3482     z   = c;
3483   }
3484 
3485   for (i = 0; i < mbs; i++) {
3486     n = ii[1] - ii[0];
3487     ii++;
3488     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3489     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3490     if (usecprow) z = c + 3 * ridx[i];
3491     jj = idx;
3492     vv = v;
3493     for (k = 0; k < cn; k++) {
3494       idx  = jj;
3495       v    = vv;
3496       sum1 = 0.0;
3497       sum2 = 0.0;
3498       sum3 = 0.0;
3499       for (j = 0; j < n; j++) {
3500         xb = b + 3 * (*idx++);
3501         x1 = xb[0 + k * bm];
3502         x2 = xb[1 + k * bm];
3503         x3 = xb[2 + k * bm];
3504         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3505         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3506         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3507         v += 9;
3508       }
3509       z[0 + k * cm] = sum1;
3510       z[1 + k * cm] = sum2;
3511       z[2 + k * cm] = sum3;
3512     }
3513     if (!usecprow) z += 3;
3514   }
3515   PetscFunctionReturn(PETSC_SUCCESS);
3516 }
3517 
MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3518 static PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3519 {
3520   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3521   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3522   const PetscScalar *xb;
3523   PetscScalar        x1, x2, x3, x4;
3524   const MatScalar   *v, *vv;
3525   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3526   PetscBool          usecprow = a->compressedrow.use;
3527 
3528   PetscFunctionBegin;
3529   idx = a->j;
3530   v   = a->a;
3531   if (usecprow) {
3532     mbs  = a->compressedrow.nrows;
3533     ii   = a->compressedrow.i;
3534     ridx = a->compressedrow.rindex;
3535   } else {
3536     mbs = a->mbs;
3537     ii  = a->i;
3538     z   = c;
3539   }
3540 
3541   for (i = 0; i < mbs; i++) {
3542     n = ii[1] - ii[0];
3543     ii++;
3544     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3545     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3546     if (usecprow) z = c + 4 * ridx[i];
3547     jj = idx;
3548     vv = v;
3549     for (k = 0; k < cn; k++) {
3550       idx  = jj;
3551       v    = vv;
3552       sum1 = 0.0;
3553       sum2 = 0.0;
3554       sum3 = 0.0;
3555       sum4 = 0.0;
3556       for (j = 0; j < n; j++) {
3557         xb = b + 4 * (*idx++);
3558         x1 = xb[0 + k * bm];
3559         x2 = xb[1 + k * bm];
3560         x3 = xb[2 + k * bm];
3561         x4 = xb[3 + k * bm];
3562         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3563         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3564         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3565         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3566         v += 16;
3567       }
3568       z[0 + k * cm] = sum1;
3569       z[1 + k * cm] = sum2;
3570       z[2 + k * cm] = sum3;
3571       z[3 + k * cm] = sum4;
3572     }
3573     if (!usecprow) z += 4;
3574   }
3575   PetscFunctionReturn(PETSC_SUCCESS);
3576 }
3577 
MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3578 static PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3579 {
3580   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3581   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3582   const PetscScalar *xb;
3583   PetscScalar        x1, x2, x3, x4, x5;
3584   const MatScalar   *v, *vv;
3585   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3586   PetscBool          usecprow = a->compressedrow.use;
3587 
3588   PetscFunctionBegin;
3589   idx = a->j;
3590   v   = a->a;
3591   if (usecprow) {
3592     mbs  = a->compressedrow.nrows;
3593     ii   = a->compressedrow.i;
3594     ridx = a->compressedrow.rindex;
3595   } else {
3596     mbs = a->mbs;
3597     ii  = a->i;
3598     z   = c;
3599   }
3600 
3601   for (i = 0; i < mbs; i++) {
3602     n = ii[1] - ii[0];
3603     ii++;
3604     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3605     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3606     if (usecprow) z = c + 5 * ridx[i];
3607     jj = idx;
3608     vv = v;
3609     for (k = 0; k < cn; k++) {
3610       idx  = jj;
3611       v    = vv;
3612       sum1 = 0.0;
3613       sum2 = 0.0;
3614       sum3 = 0.0;
3615       sum4 = 0.0;
3616       sum5 = 0.0;
3617       for (j = 0; j < n; j++) {
3618         xb = b + 5 * (*idx++);
3619         x1 = xb[0 + k * bm];
3620         x2 = xb[1 + k * bm];
3621         x3 = xb[2 + k * bm];
3622         x4 = xb[3 + k * bm];
3623         x5 = xb[4 + k * bm];
3624         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3625         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3626         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3627         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3628         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3629         v += 25;
3630       }
3631       z[0 + k * cm] = sum1;
3632       z[1 + k * cm] = sum2;
3633       z[2 + k * cm] = sum3;
3634       z[3 + k * cm] = sum4;
3635       z[4 + k * cm] = sum5;
3636     }
3637     if (!usecprow) z += 5;
3638   }
3639   PetscFunctionReturn(PETSC_SUCCESS);
3640 }
3641 
MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)3642 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3643 {
3644   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3645   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3646   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3647   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3648   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3649   PetscBLASInt     bbs, bcn, bbm, bcm;
3650   PetscScalar     *z = NULL;
3651   PetscScalar     *c, *b;
3652   const MatScalar *v;
3653   const PetscInt  *idx, *ii, *ridx = NULL;
3654   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3655   PetscBool        usecprow = a->compressedrow.use;
3656 
3657   PetscFunctionBegin;
3658   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3659   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3660   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3661   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3662   b = bd->v;
3663   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3664   PetscCall(MatDenseGetArrayWrite(C, &c));
3665   switch (bs) {
3666   case 1:
3667     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3668     break;
3669   case 2:
3670     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3671     break;
3672   case 3:
3673     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3674     break;
3675   case 4:
3676     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3677     break;
3678   case 5:
3679     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3680     break;
3681   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3682     PetscCall(PetscBLASIntCast(bs, &bbs));
3683     PetscCall(PetscBLASIntCast(cn, &bcn));
3684     PetscCall(PetscBLASIntCast(bm, &bbm));
3685     PetscCall(PetscBLASIntCast(cm, &bcm));
3686     idx = a->j;
3687     v   = a->a;
3688     if (usecprow) {
3689       mbs  = a->compressedrow.nrows;
3690       ii   = a->compressedrow.i;
3691       ridx = a->compressedrow.rindex;
3692     } else {
3693       mbs = a->mbs;
3694       ii  = a->i;
3695       z   = c;
3696     }
3697     for (i = 0; i < mbs; i++) {
3698       n = ii[1] - ii[0];
3699       ii++;
3700       if (usecprow) z = c + bs * ridx[i];
3701       if (n) {
3702         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3703         v += bs2;
3704       }
3705       for (j = 1; j < n; j++) {
3706         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3707         v += bs2;
3708       }
3709       if (!usecprow) z += bs;
3710     }
3711   }
3712   PetscCall(MatDenseRestoreArrayWrite(C, &c));
3713   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3714   PetscFunctionReturn(PETSC_SUCCESS);
3715 }
3716 
MatTransposeMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3717 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3718 {
3719   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3720   const MatScalar   *v;
3721   PetscInt           mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3722   PetscBool          usecprow = a->compressedrow.use;
3723   const PetscScalar *bi;
3724 
3725   PetscFunctionBegin;
3726   idx = a->j;
3727   v   = a->a;
3728   if (usecprow) {
3729     mbs  = a->compressedrow.nrows;
3730     ii   = a->compressedrow.i;
3731     ridx = a->compressedrow.rindex;
3732   } else {
3733     mbs = a->mbs;
3734     ii  = a->i;
3735   }
3736 
3737   for (i = 0; i < mbs; i++) {
3738     PetscInt brow = usecprow ? ridx[i] : i;
3739 
3740     n = ii[1] - ii[0];
3741     ii++;
3742     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3743     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3744     for (j = 0, bi = b + 1 * brow; j < n; j++) {
3745       PetscScalar *zcol = c + 1 * (*idx++);
3746 
3747       for (k = 0; k < cn; k++) zcol[0 + k * cm] += v[0] * bi[k * bm];
3748       ++v;
3749     }
3750   }
3751   PetscFunctionReturn(PETSC_SUCCESS);
3752 }
3753 
MatTransposeMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3754 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3755 {
3756   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3757   const MatScalar   *v;
3758   PetscInt           mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3759   PetscBool          usecprow = a->compressedrow.use;
3760   const PetscScalar *bi;
3761   PetscScalar        x1, x2;
3762 
3763   PetscFunctionBegin;
3764   idx = a->j;
3765   v   = a->a;
3766   if (usecprow) {
3767     mbs  = a->compressedrow.nrows;
3768     ii   = a->compressedrow.i;
3769     ridx = a->compressedrow.rindex;
3770   } else {
3771     mbs = a->mbs;
3772     ii  = a->i;
3773   }
3774 
3775   for (i = 0; i < mbs; i++) {
3776     PetscInt brow = usecprow ? ridx[i] : i;
3777 
3778     n = ii[1] - ii[0];
3779     ii++;
3780     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3781     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3782     for (j = 0, bi = b + 2 * brow; j < n; j++) {
3783       PetscScalar *zcol = c + 2 * (*idx++);
3784 
3785       for (k = 0; k < cn; k++) {
3786         x1 = bi[0 + k * bm];
3787         x2 = bi[1 + k * bm];
3788         zcol[0 + k * cm] += v[0] * x1 + v[1] * x2;
3789         zcol[1 + k * cm] += v[2] * x1 + v[3] * x2;
3790       }
3791       v += 4;
3792     }
3793   }
3794   PetscFunctionReturn(PETSC_SUCCESS);
3795 }
3796 
MatTransposeMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3797 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3798 {
3799   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3800   const MatScalar   *v;
3801   PetscInt           mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3802   PetscBool          usecprow = a->compressedrow.use;
3803   const PetscScalar *bi;
3804   PetscScalar        x1, x2, x3;
3805 
3806   PetscFunctionBegin;
3807   idx = a->j;
3808   v   = a->a;
3809   if (usecprow) {
3810     mbs  = a->compressedrow.nrows;
3811     ii   = a->compressedrow.i;
3812     ridx = a->compressedrow.rindex;
3813   } else {
3814     mbs = a->mbs;
3815     ii  = a->i;
3816   }
3817 
3818   for (i = 0; i < mbs; i++) {
3819     PetscInt brow = usecprow ? ridx[i] : i;
3820 
3821     n = ii[1] - ii[0];
3822     ii++;
3823     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3824     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3825     for (j = 0, bi = b + 3 * brow; j < n; j++) {
3826       PetscScalar *zcol = c + 3 * (*idx++);
3827 
3828       for (k = 0; k < cn; k++) {
3829         x1 = bi[0 + k * bm];
3830         x2 = bi[1 + k * bm];
3831         x3 = bi[2 + k * bm];
3832         zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3833         zcol[1 + k * cm] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3834         zcol[2 + k * cm] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3835       }
3836       v += 9;
3837     }
3838   }
3839   PetscFunctionReturn(PETSC_SUCCESS);
3840 }
3841 
MatTransposeMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3842 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3843 {
3844   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3845   const MatScalar   *v;
3846   PetscInt           mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3847   PetscBool          usecprow = a->compressedrow.use;
3848   const PetscScalar *bi;
3849   PetscScalar        x1, x2, x3, x4;
3850 
3851   PetscFunctionBegin;
3852   idx = a->j;
3853   v   = a->a;
3854   if (usecprow) {
3855     mbs  = a->compressedrow.nrows;
3856     ii   = a->compressedrow.i;
3857     ridx = a->compressedrow.rindex;
3858   } else {
3859     mbs = a->mbs;
3860     ii  = a->i;
3861   }
3862 
3863   for (i = 0; i < mbs; i++) {
3864     PetscInt brow = usecprow ? ridx[i] : i;
3865 
3866     n = ii[1] - ii[0];
3867     ii++;
3868     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3869     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3870     for (j = 0, bi = b + 4 * brow; j < n; j++) {
3871       PetscScalar *zcol = c + 4 * (*idx++);
3872 
3873       for (k = 0; k < cn; k++) {
3874         x1 = bi[0 + k * bm];
3875         x2 = bi[1 + k * bm];
3876         x3 = bi[2 + k * bm];
3877         x4 = bi[3 + k * bm];
3878         zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3879         zcol[1 + k * cm] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3880         zcol[2 + k * cm] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3881         zcol[3 + k * cm] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3882       }
3883       v += 16;
3884     }
3885   }
3886   PetscFunctionReturn(PETSC_SUCCESS);
3887 }
3888 
MatTransposeMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)3889 static PetscErrorCode MatTransposeMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3890 {
3891   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3892   const MatScalar   *v;
3893   PetscInt           mbs, i, *idx, *ii, j, n, k, *ridx = NULL;
3894   PetscBool          usecprow = a->compressedrow.use;
3895   const PetscScalar *bi;
3896   PetscScalar        x1, x2, x3, x4, x5;
3897 
3898   PetscFunctionBegin;
3899   idx = a->j;
3900   v   = a->a;
3901   if (usecprow) {
3902     mbs  = a->compressedrow.nrows;
3903     ii   = a->compressedrow.i;
3904     ridx = a->compressedrow.rindex;
3905   } else {
3906     mbs = a->mbs;
3907     ii  = a->i;
3908   }
3909 
3910   for (i = 0; i < mbs; i++) {
3911     PetscInt brow = usecprow ? ridx[i] : i;
3912 
3913     n = ii[1] - ii[0];
3914     ii++;
3915     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3916     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3917     for (j = 0, bi = b + 5 * brow; j < n; j++) {
3918       PetscScalar *zcol = c + 5 * (*idx++);
3919 
3920       for (k = 0; k < cn; k++) {
3921         x1 = bi[0 + k * bm];
3922         x2 = bi[1 + k * bm];
3923         x3 = bi[2 + k * bm];
3924         x4 = bi[3 + k * bm];
3925         x5 = bi[4 + k * bm];
3926         zcol[0 + k * cm] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3927         zcol[1 + k * cm] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3928         zcol[2 + k * cm] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3929         zcol[3 + k * cm] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3930         zcol[4 + k * cm] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3931       }
3932       v += 25;
3933     }
3934   }
3935   PetscFunctionReturn(PETSC_SUCCESS);
3936 }
3937 
MatTransposeMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)3938 PetscErrorCode MatTransposeMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3939 {
3940   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3941   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3942   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3943   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3944   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3945   PetscBLASInt     bbs, bcn, bbm, bcm;
3946   PetscScalar     *c, *b;
3947   const MatScalar *v;
3948   const PetscInt  *idx, *ii, *ridx = NULL;
3949   PetscScalar      _DOne    = 1.0;
3950   PetscBool        usecprow = a->compressedrow.use;
3951 
3952   PetscFunctionBegin;
3953   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3954   PetscCheck(B->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3955   PetscCheck(A->cmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal columns in A %" PetscInt_FMT, C->rmap->n, A->cmap->n);
3956   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3957   b = bd->v;
3958   PetscCall(MatZeroEntries(C));
3959   PetscCall(MatDenseGetArrayWrite(C, &c));
3960   switch (bs) {
3961   case 1:
3962     PetscCall(MatTransposeMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3963     break;
3964   case 2:
3965     PetscCall(MatTransposeMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3966     break;
3967   case 3:
3968     PetscCall(MatTransposeMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3969     break;
3970   case 4:
3971     PetscCall(MatTransposeMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3972     break;
3973   case 5:
3974     PetscCall(MatTransposeMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3975     break;
3976   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3977     PetscCall(PetscBLASIntCast(bs, &bbs));
3978     PetscCall(PetscBLASIntCast(cn, &bcn));
3979     PetscCall(PetscBLASIntCast(bm, &bbm));
3980     PetscCall(PetscBLASIntCast(cm, &bcm));
3981     idx = a->j;
3982     v   = a->a;
3983     if (usecprow) {
3984       mbs  = a->compressedrow.nrows;
3985       ii   = a->compressedrow.i;
3986       ridx = a->compressedrow.rindex;
3987     } else {
3988       mbs = a->mbs;
3989       ii  = a->i;
3990     }
3991     for (i = 0; i < mbs; i++) {
3992       const PetscScalar *bi = b + bs * (usecprow ? ridx[i] : i);
3993 
3994       n = ii[1] - ii[0];
3995       ii++;
3996       for (j = 0; j < n; j++) {
3997         PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, bi, &bbm, &_DOne, c + bs * (*idx++), &bcm));
3998         v += bs2;
3999       }
4000     }
4001   }
4002   PetscCall(MatDenseRestoreArrayWrite(C, &c));
4003   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
4004   PetscFunctionReturn(PETSC_SUCCESS);
4005 }
4006