xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision f5ff9c666c0d37e8a7ec3aa2f8e2aa9e44449bdb)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7 
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 #include <immintrin.h>
10 #endif
11 
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscBool      flag;
88 
89   PetscFunctionBegin;
90   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
91   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
93   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
94 
95   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
96   ssmap = smap;
97   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
98   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99   /* determine lens of each row */
100   for (i=0; i<nrows; i++) {
101     kstart  = ai[irow[i]];
102     kend    = kstart + a->ilen[irow[i]];
103     lens[i] = 0;
104     for (k=kstart; k<kend; k++) {
105       if (ssmap[aj[k]]) lens[i]++;
106     }
107   }
108   /* Create and fill new matrix */
109   if (scall == MAT_REUSE_MATRIX) {
110     c = (Mat_SeqBAIJ*)((*B)->data);
111 
112     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
114     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
116     C    = *B;
117   } else {
118     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
119     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
120     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
121     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
122   }
123   c = (Mat_SeqBAIJ*)(C->data);
124   for (i=0; i<nrows; i++) {
125     row      = irow[i];
126     kstart   = ai[row];
127     kend     = kstart + a->ilen[row];
128     mat_i    = c->i[i];
129     mat_j    = c->j + mat_i;
130     mat_a    = c->a + mat_i*bs2;
131     mat_ilen = c->ilen + i;
132     for (k=kstart; k<kend; k++) {
133       if ((tcol=ssmap[a->j[k]])) {
134         *mat_j++ = tcol - 1;
135         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
136         mat_a   += bs2;
137         (*mat_ilen)++;
138       }
139     }
140   }
141   /* sort */
142   {
143     MatScalar *work;
144     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
145     for (i=0; i<nrows; i++) {
146       PetscInt ilen;
147       mat_i = c->i[i];
148       mat_j = c->j + mat_i;
149       mat_a = c->a + mat_i*bs2;
150       ilen  = c->ilen[i];
151       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
152     }
153     ierr = PetscFree(work);CHKERRQ(ierr);
154   }
155 
156   /* Free work space */
157   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
158   ierr = PetscFree(smap);CHKERRQ(ierr);
159   ierr = PetscFree(lens);CHKERRQ(ierr);
160   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162 
163   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
164   *B   = C;
165   PetscFunctionReturn(0);
166 }
167 
168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169 {
170   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
171   IS             is1,is2;
172   PetscErrorCode ierr;
173   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174   const PetscInt *irow,*icol;
175 
176   PetscFunctionBegin;
177   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
178   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
179   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
180   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
181 
182   /* Verify if the indices corespond to each element in a block
183    and form the IS with compressed IS */
184   maxmnbs = PetscMax(a->mbs,a->nbs);
185   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
186   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
187   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190   }
191   count = 0;
192   for (i=0; i<nrows; i++) {
193     j = irow[i] / bs;
194     if ((vary[j]--)==bs) iary[count++] = j;
195   }
196   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
197 
198   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
199   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200   for (i=0; i<a->nbs; i++) {
201     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202   }
203   count = 0;
204   for (i=0; i<ncols; i++) {
205     j = icol[i] / bs;
206     if ((vary[j]--)==bs) iary[count++] = j;
207   }
208   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
209   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
211   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
212 
213   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
214   ierr = ISDestroy(&is1);CHKERRQ(ierr);
215   ierr = ISDestroy(&is2);CHKERRQ(ierr);
216   PetscFunctionReturn(0);
217 }
218 
219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220 {
221   PetscErrorCode ierr;
222   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
223   Mat_SubSppt    *submatj = c->submatis1;
224 
225   PetscFunctionBegin;
226   ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
227   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
230 
231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232 {
233   PetscErrorCode ierr;
234   PetscInt       i;
235   Mat            C;
236   Mat_SeqBAIJ    *c;
237   Mat_SubSppt    *submatj;
238 
239   PetscFunctionBegin;
240   for (i=0; i<n; i++) {
241     C       = (*mat)[i];
242     c       = (Mat_SeqBAIJ*)C->data;
243     submatj = c->submatis1;
244     if (submatj) {
245       ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
246       ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
247       ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
248       ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
249       ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
250       ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
251     } else {
252       ierr = MatDestroy(&C);CHKERRQ(ierr);
253     }
254   }
255   ierr = PetscFree(*mat);CHKERRQ(ierr);
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260 {
261   PetscErrorCode ierr;
262   PetscInt       i;
263 
264   PetscFunctionBegin;
265   if (scall == MAT_INITIAL_MATRIX) {
266     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
267   }
268 
269   for (i=0; i<n; i++) {
270     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
271   }
272   PetscFunctionReturn(0);
273 }
274 
275 /* -------------------------------------------------------*/
276 /* Should check that shapes of vectors and matrices match */
277 /* -------------------------------------------------------*/
278 
279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282   PetscScalar       *z,sum;
283   const PetscScalar *x;
284   const MatScalar   *v;
285   PetscErrorCode    ierr;
286   PetscInt          mbs,i,n;
287   const PetscInt    *idx,*ii,*ridx=NULL;
288   PetscBool         usecprow=a->compressedrow.use;
289 
290   PetscFunctionBegin;
291   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArrayWrite(zz,&z);CHKERRQ(ierr);
293 
294   if (usecprow) {
295     mbs  = a->compressedrow.nrows;
296     ii   = a->compressedrow.i;
297     ridx = a->compressedrow.rindex;
298     ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr);
299   } else {
300     mbs = a->mbs;
301     ii  = a->i;
302   }
303 
304   for (i=0; i<mbs; i++) {
305     n   = ii[1] - ii[0];
306     v   = a->a + ii[0];
307     idx = a->j + ii[0];
308     ii++;
309     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
310     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311     sum = 0.0;
312     PetscSparseDensePlusDot(sum,x,v,idx,n);
313     if (usecprow) {
314       z[ridx[i]] = sum;
315     } else {
316       z[i]       = sum;
317     }
318   }
319   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
320   ierr = VecRestoreArrayWrite(zz,&z);CHKERRQ(ierr);
321   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
322   PetscFunctionReturn(0);
323 }
324 
325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326 {
327   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
328   PetscScalar       *z = NULL,sum1,sum2,*zarray;
329   const PetscScalar *x,*xb;
330   PetscScalar       x1,x2;
331   const MatScalar   *v;
332   PetscErrorCode    ierr;
333   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
334   PetscBool         usecprow=a->compressedrow.use;
335 
336   PetscFunctionBegin;
337   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
339 
340   idx = a->j;
341   v   = a->a;
342   if (usecprow) {
343     mbs  = a->compressedrow.nrows;
344     ii   = a->compressedrow.i;
345     ridx = a->compressedrow.rindex;
346     ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr);
347   } else {
348     mbs = a->mbs;
349     ii  = a->i;
350     z   = zarray;
351   }
352 
353   for (i=0; i<mbs; i++) {
354     n           = ii[1] - ii[0]; ii++;
355     sum1        = 0.0; sum2 = 0.0;
356     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
357     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358     for (j=0; j<n; j++) {
359       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360       sum1 += v[0]*x1 + v[2]*x2;
361       sum2 += v[1]*x1 + v[3]*x2;
362       v    += 4;
363     }
364     if (usecprow) z = zarray + 2*ridx[i];
365     z[0] = sum1; z[1] = sum2;
366     if (!usecprow) z += 2;
367   }
368   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
369   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
370   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
371   PetscFunctionReturn(0);
372 }
373 
374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375 {
376   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
377   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
378   const PetscScalar *x,*xb;
379   const MatScalar   *v;
380   PetscErrorCode    ierr;
381   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
382   PetscBool         usecprow=a->compressedrow.use;
383 
384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385 #pragma disjoint(*v,*z,*xb)
386 #endif
387 
388   PetscFunctionBegin;
389   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
391 
392   idx = a->j;
393   v   = a->a;
394   if (usecprow) {
395     mbs  = a->compressedrow.nrows;
396     ii   = a->compressedrow.i;
397     ridx = a->compressedrow.rindex;
398     ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr);
399   } else {
400     mbs = a->mbs;
401     ii  = a->i;
402     z   = zarray;
403   }
404 
405   for (i=0; i<mbs; i++) {
406     n           = ii[1] - ii[0]; ii++;
407     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
408     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410     for (j=0; j<n; j++) {
411       xb = x + 3*(*idx++);
412       x1 = xb[0];
413       x2 = xb[1];
414       x3 = xb[2];
415 
416       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419       v    += 9;
420     }
421     if (usecprow) z = zarray + 3*ridx[i];
422     z[0] = sum1; z[1] = sum2; z[2] = sum3;
423     if (!usecprow) z += 3;
424   }
425   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
426   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
427   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432 {
433   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
434   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435   const PetscScalar *x,*xb;
436   const MatScalar   *v;
437   PetscErrorCode    ierr;
438   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
439   PetscBool         usecprow=a->compressedrow.use;
440 
441   PetscFunctionBegin;
442   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
443   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
444 
445   idx = a->j;
446   v   = a->a;
447   if (usecprow) {
448     mbs  = a->compressedrow.nrows;
449     ii   = a->compressedrow.i;
450     ridx = a->compressedrow.rindex;
451     ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr);
452   } else {
453     mbs = a->mbs;
454     ii  = a->i;
455     z   = zarray;
456   }
457 
458   for (i=0; i<mbs; i++) {
459     n = ii[1] - ii[0];
460     ii++;
461     sum1 = 0.0;
462     sum2 = 0.0;
463     sum3 = 0.0;
464     sum4 = 0.0;
465 
466     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
467     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468     for (j=0; j<n; j++) {
469       xb    = x + 4*(*idx++);
470       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
472       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
473       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
474       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
475       v    += 16;
476     }
477     if (usecprow) z = zarray + 4*ridx[i];
478     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479     if (!usecprow) z += 4;
480   }
481   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
482   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
483   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
484   PetscFunctionReturn(0);
485 }
486 
487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488 {
489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
491   const PetscScalar *xb,*x;
492   const MatScalar   *v;
493   PetscErrorCode    ierr;
494   const PetscInt    *idx,*ii,*ridx=NULL;
495   PetscInt          mbs,i,j,n;
496   PetscBool         usecprow=a->compressedrow.use;
497 
498   PetscFunctionBegin;
499   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
500   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
501 
502   idx = a->j;
503   v   = a->a;
504   if (usecprow) {
505     mbs  = a->compressedrow.nrows;
506     ii   = a->compressedrow.i;
507     ridx = a->compressedrow.rindex;
508     ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr);
509   } else {
510     mbs = a->mbs;
511     ii  = a->i;
512     z   = zarray;
513   }
514 
515   for (i=0; i<mbs; i++) {
516     n           = ii[1] - ii[0]; ii++;
517     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
519     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520     for (j=0; j<n; j++) {
521       xb    = x + 5*(*idx++);
522       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
524       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
525       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
526       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
527       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
528       v    += 25;
529     }
530     if (usecprow) z = zarray + 5*ridx[i];
531     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532     if (!usecprow) z += 5;
533   }
534   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
535   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
536   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541 {
542   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
543   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
544   const PetscScalar *x,*xb;
545   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
546   const MatScalar   *v;
547   PetscErrorCode    ierr;
548   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
549   PetscBool         usecprow=a->compressedrow.use;
550 
551   PetscFunctionBegin;
552   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
553   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
554 
555   idx = a->j;
556   v   = a->a;
557   if (usecprow) {
558     mbs  = a->compressedrow.nrows;
559     ii   = a->compressedrow.i;
560     ridx = a->compressedrow.rindex;
561     ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr);
562   } else {
563     mbs = a->mbs;
564     ii  = a->i;
565     z   = zarray;
566   }
567 
568   for (i=0; i<mbs; i++) {
569     n  = ii[1] - ii[0];
570     ii++;
571     sum1 = 0.0;
572     sum2 = 0.0;
573     sum3 = 0.0;
574     sum4 = 0.0;
575     sum5 = 0.0;
576     sum6 = 0.0;
577 
578     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
579     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580     for (j=0; j<n; j++) {
581       xb    = x + 6*(*idx++);
582       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
584       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
585       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
586       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
587       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
588       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
589       v    += 36;
590     }
591     if (usecprow) z = zarray + 6*ridx[i];
592     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593     if (!usecprow) z += 6;
594   }
595 
596   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
597   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
598   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
599   PetscFunctionReturn(0);
600 }
601 
602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603 {
604   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
605   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606   const PetscScalar *x,*xb;
607   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
608   const MatScalar   *v;
609   PetscErrorCode    ierr;
610   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
611   PetscBool         usecprow=a->compressedrow.use;
612 
613   PetscFunctionBegin;
614   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
615   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
616 
617   idx = a->j;
618   v   = a->a;
619   if (usecprow) {
620     mbs  = a->compressedrow.nrows;
621     ii   = a->compressedrow.i;
622     ridx = a->compressedrow.rindex;
623     ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr);
624   } else {
625     mbs = a->mbs;
626     ii  = a->i;
627     z   = zarray;
628   }
629 
630   for (i=0; i<mbs; i++) {
631     n  = ii[1] - ii[0];
632     ii++;
633     sum1 = 0.0;
634     sum2 = 0.0;
635     sum3 = 0.0;
636     sum4 = 0.0;
637     sum5 = 0.0;
638     sum6 = 0.0;
639     sum7 = 0.0;
640 
641     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
642     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643     for (j=0; j<n; j++) {
644       xb    = x + 7*(*idx++);
645       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653       v    += 49;
654     }
655     if (usecprow) z = zarray + 7*ridx[i];
656     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657     if (!usecprow) z += 7;
658   }
659 
660   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
661   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
662   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
663   PetscFunctionReturn(0);
664 }
665 
666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668 {
669   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
670   PetscScalar       *z = NULL,*work,*workt,*zarray;
671   const PetscScalar *x,*xb;
672   const MatScalar   *v;
673   PetscErrorCode    ierr;
674   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675   const PetscInt    *idx,*ii,*ridx=NULL;
676   PetscInt          k;
677   PetscBool         usecprow=a->compressedrow.use;
678 
679   __m256d a0,a1,a2,a3,a4,a5;
680   __m256d w0,w1,w2,w3;
681   __m256d z0,z1,z2;
682   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
683 
684   PetscFunctionBegin;
685   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
686   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
687 
688   idx = a->j;
689   v   = a->a;
690   if (usecprow) {
691     mbs  = a->compressedrow.nrows;
692     ii   = a->compressedrow.i;
693     ridx = a->compressedrow.rindex;
694     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
695   } else {
696     mbs = a->mbs;
697     ii  = a->i;
698     z   = zarray;
699   }
700 
701   if (!a->mult_work) {
702     k    = PetscMax(A->rmap->n,A->cmap->n);
703     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
704   }
705 
706   work = a->mult_work;
707   for (i=0; i<mbs; i++) {
708     n           = ii[1] - ii[0]; ii++;
709     workt       = work;
710     for (j=0; j<n; j++) {
711       xb = x + bs*(*idx++);
712       for (k=0; k<bs; k++) workt[k] = xb[k];
713       workt += bs;
714     }
715     if (usecprow) z = zarray + bs*ridx[i];
716 
717     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
718 
719     for (j=0; j<n; j++) {
720       /* first column of a */
721       w0 = _mm256_set1_pd(work[j*9  ]);
722       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
725 
726       /* second column of a */
727       w1 = _mm256_set1_pd(work[j*9+ 1]);
728       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
731 
732       /* third column of a */
733       w2 = _mm256_set1_pd(work[j*9 +2]);
734       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
737 
738       /* fourth column of a */
739       w3 = _mm256_set1_pd(work[j*9+ 3]);
740       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
743 
744       /* fifth column of a */
745       w0 = _mm256_set1_pd(work[j*9+ 4]);
746       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
749 
750       /* sixth column of a */
751       w1 = _mm256_set1_pd(work[j*9+ 5]);
752       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
755 
756       /* seventh column of a */
757       w2 = _mm256_set1_pd(work[j*9+ 6]);
758       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
761 
762       /* eigth column of a */
763       w3 = _mm256_set1_pd(work[j*9+ 7]);
764       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
767 
768       /* ninth column of a */
769       w0 = _mm256_set1_pd(work[j*9+ 8]);
770       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773     }
774 
775     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
776 
777     v += n*bs2;
778     if (!usecprow) z += bs;
779   }
780   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
781   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
782   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
783   PetscFunctionReturn(0);
784 }
785 #endif
786 
787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788 {
789   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
790   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791   const PetscScalar *x,*xb;
792   PetscScalar       *zarray,xv;
793   const MatScalar   *v;
794   PetscErrorCode    ierr;
795   const PetscInt    *ii,*ij=a->j,*idx;
796   PetscInt          mbs,i,j,k,n,*ridx=NULL;
797   PetscBool         usecprow=a->compressedrow.use;
798 
799   PetscFunctionBegin;
800   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
801   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
802 
803   v = a->a;
804   if (usecprow) {
805     mbs  = a->compressedrow.nrows;
806     ii   = a->compressedrow.i;
807     ridx = a->compressedrow.rindex;
808     ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr);
809   } else {
810     mbs = a->mbs;
811     ii  = a->i;
812     z   = zarray;
813   }
814 
815   for (i=0; i<mbs; i++) {
816     n    = ii[i+1] - ii[i];
817     idx  = ij + ii[i];
818     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
820 
821     for (j=0; j<n; j++) {
822       xb = x + 11*(idx[j]);
823 
824       for (k=0; k<11; k++) {
825         xv     =  xb[k];
826         sum1  += v[0]*xv;
827         sum2  += v[1]*xv;
828         sum3  += v[2]*xv;
829         sum4  += v[3]*xv;
830         sum5  += v[4]*xv;
831         sum6  += v[5]*xv;
832         sum7  += v[6]*xv;
833         sum8  += v[7]*xv;
834         sum9  += v[8]*xv;
835         sum10 += v[9]*xv;
836         sum11 += v[10]*xv;
837         v     += 11;
838       }
839     }
840     if (usecprow) z = zarray + 11*ridx[i];
841     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
843 
844     if (!usecprow) z += 11;
845   }
846 
847   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
848   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
849   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
850   PetscFunctionReturn(0);
851 }
852 
853 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
854 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
855 {
856   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
857   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
858   const PetscScalar *x,*xb;
859   PetscScalar       *zarray,xv;
860   const MatScalar   *v;
861   PetscErrorCode    ierr;
862   const PetscInt    *ii,*ij=a->j,*idx;
863   PetscInt          mbs,i,j,k,n,*ridx=NULL;
864   PetscBool         usecprow=a->compressedrow.use;
865 
866   PetscFunctionBegin;
867   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
868   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
869 
870   v = a->a;
871   if (usecprow) {
872     mbs  = a->compressedrow.nrows;
873     ii   = a->compressedrow.i;
874     ridx = a->compressedrow.rindex;
875     ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr);
876   } else {
877     mbs = a->mbs;
878     ii  = a->i;
879     z   = zarray;
880   }
881 
882   for (i=0; i<mbs; i++) {
883     n    = ii[i+1] - ii[i];
884     idx  = ij + ii[i];
885     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
886     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
887 
888     for (j=0; j<n; j++) {
889       xb = x + 12*(idx[j]);
890 
891       for (k=0; k<12; k++) {
892         xv     =  xb[k];
893         sum1  += v[0]*xv;
894         sum2  += v[1]*xv;
895         sum3  += v[2]*xv;
896         sum4  += v[3]*xv;
897         sum5  += v[4]*xv;
898         sum6  += v[5]*xv;
899         sum7  += v[6]*xv;
900         sum8  += v[7]*xv;
901         sum9  += v[8]*xv;
902         sum10 += v[9]*xv;
903         sum11 += v[10]*xv;
904         sum12 += v[11]*xv;
905         v     += 12;
906       }
907     }
908     if (usecprow) z = zarray + 12*ridx[i];
909     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
910     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
911     if (!usecprow) z += 12;
912   }
913   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
914   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
915   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
920 {
921   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
922   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
923   const PetscScalar *x,*xb;
924   PetscScalar       *zarray,*yarray,xv;
925   const MatScalar   *v;
926   PetscErrorCode    ierr;
927   const PetscInt    *ii,*ij=a->j,*idx;
928   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
929   PetscBool         usecprow=a->compressedrow.use;
930 
931   PetscFunctionBegin;
932   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
933   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
934 
935   v = a->a;
936   if (usecprow) {
937    if (zz != yy) {
938      ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr);
939     }
940     mbs  = a->compressedrow.nrows;
941     ii   = a->compressedrow.i;
942     ridx = a->compressedrow.rindex;
943   } else {
944     ii  = a->i;
945     y   = yarray;
946     z   = zarray;
947   }
948 
949   for (i=0; i<mbs; i++) {
950     n    = ii[i+1] - ii[i];
951     idx  = ij + ii[i];
952 
953     if (usecprow){
954       y = yarray + 12*ridx[i];
955       z = zarray + 12*ridx[i];
956     }
957     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
958     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
959 
960     for (j=0; j<n; j++) {
961       xb = x + 12*(idx[j]);
962 
963       for (k=0; k<12; k++) {
964         xv     =  xb[k];
965         sum1  += v[0]*xv;
966         sum2  += v[1]*xv;
967         sum3  += v[2]*xv;
968         sum4  += v[3]*xv;
969         sum5  += v[4]*xv;
970         sum6  += v[5]*xv;
971         sum7  += v[6]*xv;
972         sum8  += v[7]*xv;
973         sum9  += v[8]*xv;
974         sum10 += v[9]*xv;
975         sum11 += v[10]*xv;
976         sum12 += v[11]*xv;
977         v     += 12;
978       }
979     }
980 
981     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
982     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
983     if (!usecprow) {
984       y += 12;
985       z += 12;
986     }
987   }
988   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
989   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
990   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
991   PetscFunctionReturn(0);
992 }
993 
994 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
995 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
996 {
997   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
998   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
999   const PetscScalar *x,*xb;
1000   PetscScalar       x1,x2,x3,x4,*zarray;
1001   const MatScalar   *v;
1002   PetscErrorCode    ierr;
1003   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1004   PetscInt          mbs,i,j,n;
1005   PetscBool         usecprow=a->compressedrow.use;
1006 
1007   PetscFunctionBegin;
1008   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1009   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1010 
1011   v = a->a;
1012   if (usecprow) {
1013     mbs  = a->compressedrow.nrows;
1014     ii   = a->compressedrow.i;
1015     ridx = a->compressedrow.rindex;
1016     ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr);
1017   } else {
1018     mbs = a->mbs;
1019     ii  = a->i;
1020     z   = zarray;
1021   }
1022 
1023   for (i=0; i<mbs; i++) {
1024     n    = ii[i+1] - ii[i];
1025     idx  = ij + ii[i];
1026 
1027     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1028     for (j=0; j<n; j++) {
1029       xb = x + 12*(idx[j]);
1030       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1031 
1032       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1033       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1034       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1035       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1036       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1037       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1038       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1039       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1040       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1041       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1042       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1043       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1044       v += 48;
1045 
1046       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1047 
1048       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1049       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1050       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1051       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1052       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1053       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1054       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1055       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1056       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1057       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1058       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1059       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1060       v     += 48;
1061 
1062       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1063       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1064       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1065       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1066       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1067       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1068       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1069       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1070       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1071       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1072       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1073       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1074       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1075       v     += 48;
1076 
1077     }
1078     if (usecprow) z = zarray + 12*ridx[i];
1079     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1080     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1081     if (!usecprow) z += 12;
1082   }
1083   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1084   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1085   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1090 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1091 {
1092   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1093   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1094   const PetscScalar *x,*xb;
1095   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1096   const MatScalar   *v;
1097   PetscErrorCode    ierr;
1098   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1099   PetscInt          mbs = a->mbs,i,j,n;
1100   PetscBool         usecprow=a->compressedrow.use;
1101 
1102   PetscFunctionBegin;
1103   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1104   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1105 
1106   v = a->a;
1107   if (usecprow) {
1108     if (zz != yy) {
1109       ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr);
1110     }
1111     mbs  = a->compressedrow.nrows;
1112     ii   = a->compressedrow.i;
1113     ridx = a->compressedrow.rindex;
1114   } else {
1115     ii  = a->i;
1116     y   = yarray;
1117     z   = zarray;
1118   }
1119 
1120   for (i=0; i<mbs; i++) {
1121     n    = ii[i+1] - ii[i];
1122     idx  = ij + ii[i];
1123 
1124     if (usecprow) {
1125       y = yarray + 12*ridx[i];
1126       z = zarray + 12*ridx[i];
1127     }
1128     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1129     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1130 
1131     for (j=0; j<n; j++) {
1132       xb = x + 12*(idx[j]);
1133       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1134 
1135       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1136       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1137       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1138       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1139       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1140       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1141       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1142       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1143       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1144       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1145       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1146       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1147       v += 48;
1148 
1149       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1150 
1151       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1152       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1153       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1154       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1155       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1156       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1157       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1158       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1159       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1160       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1161       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1162       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1163       v     += 48;
1164 
1165       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1166       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1167       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1168       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1169       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1170       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1171       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1172       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1173       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1174       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1175       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1176       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1177       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1178       v     += 48;
1179 
1180     }
1181     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1182     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1183     if (!usecprow) {
1184       y += 12;
1185       z += 12;
1186     }
1187   }
1188   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1189   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1190   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
1191   PetscFunctionReturn(0);
1192 }
1193 
1194 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1195 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1196 {
1197   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1198   PetscScalar       *z = NULL,*zarray;
1199   const PetscScalar *x,*work;
1200   const MatScalar   *v = a->a;
1201   PetscErrorCode    ierr;
1202   PetscInt          mbs,i,j,n;
1203   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1204   PetscBool         usecprow=a->compressedrow.use;
1205   const PetscInt    bs = 12, bs2 = 144;
1206 
1207   __m256d a0,a1,a2,a3,a4,a5;
1208   __m256d w0,w1,w2,w3;
1209   __m256d z0,z1,z2;
1210 
1211   PetscFunctionBegin;
1212   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1213   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1214 
1215   if (usecprow) {
1216     mbs  = a->compressedrow.nrows;
1217     ii   = a->compressedrow.i;
1218     ridx = a->compressedrow.rindex;
1219     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1220   } else {
1221     mbs = a->mbs;
1222     ii  = a->i;
1223     z   = zarray;
1224   }
1225 
1226   for (i=0; i<mbs; i++) {
1227     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1228 
1229     n  = ii[1] - ii[0]; ii++;
1230     for (j=0; j<n; j++) {
1231       work = x + bs*(*idx++);
1232 
1233       /* first column of a */
1234       w0 = _mm256_set1_pd(work[0]);
1235       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1236       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1237       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1238 
1239       /* second column of a */
1240       w1 = _mm256_set1_pd(work[1]);
1241       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1242       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1243       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1244 
1245       /* third column of a */
1246       w2 = _mm256_set1_pd(work[2]);
1247       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1248       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1249       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1250 
1251       /* fourth column of a */
1252       w3 = _mm256_set1_pd(work[3]);
1253       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1254       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1255       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1256 
1257       /* fifth column of a */
1258       w0 = _mm256_set1_pd(work[4]);
1259       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1260       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1261       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1262 
1263       /* sixth column of a */
1264       w1 = _mm256_set1_pd(work[5]);
1265       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1266       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1267       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1268 
1269       /* seventh column of a */
1270       w2 = _mm256_set1_pd(work[6]);
1271       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1272       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1273       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1274 
1275       /* eigth column of a */
1276       w3 = _mm256_set1_pd(work[7]);
1277       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1278       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1279       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1280 
1281       /* ninth column of a */
1282       w0 = _mm256_set1_pd(work[8]);
1283       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1284       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1285       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1286 
1287       /* tenth column of a */
1288       w1 = _mm256_set1_pd(work[9]);
1289       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1290       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1291       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1292 
1293       /* eleventh column of a */
1294       w2 = _mm256_set1_pd(work[10]);
1295       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1296       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1297       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1298 
1299       /* twelveth column of a */
1300       w3 = _mm256_set1_pd(work[11]);
1301       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1302       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1303       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1304 
1305       v += bs2;
1306     }
1307     if (usecprow) z = zarray + bs*ridx[i];
1308     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1309     if (!usecprow) z += bs;
1310   }
1311   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1312   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1313   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1314   PetscFunctionReturn(0);
1315 }
1316 #endif
1317 
1318 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1319 /* Default MatMult for block size 15 */
1320 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1321 {
1322   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1323   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1324   const PetscScalar *x,*xb;
1325   PetscScalar       *zarray,xv;
1326   const MatScalar   *v;
1327   PetscErrorCode    ierr;
1328   const PetscInt    *ii,*ij=a->j,*idx;
1329   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1330   PetscBool         usecprow=a->compressedrow.use;
1331 
1332   PetscFunctionBegin;
1333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1334   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1335 
1336   v = a->a;
1337   if (usecprow) {
1338     mbs  = a->compressedrow.nrows;
1339     ii   = a->compressedrow.i;
1340     ridx = a->compressedrow.rindex;
1341     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1342   } else {
1343     mbs = a->mbs;
1344     ii  = a->i;
1345     z   = zarray;
1346   }
1347 
1348   for (i=0; i<mbs; i++) {
1349     n    = ii[i+1] - ii[i];
1350     idx  = ij + ii[i];
1351     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1352     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1353 
1354     for (j=0; j<n; j++) {
1355       xb = x + 15*(idx[j]);
1356 
1357       for (k=0; k<15; k++) {
1358         xv     =  xb[k];
1359         sum1  += v[0]*xv;
1360         sum2  += v[1]*xv;
1361         sum3  += v[2]*xv;
1362         sum4  += v[3]*xv;
1363         sum5  += v[4]*xv;
1364         sum6  += v[5]*xv;
1365         sum7  += v[6]*xv;
1366         sum8  += v[7]*xv;
1367         sum9  += v[8]*xv;
1368         sum10 += v[9]*xv;
1369         sum11 += v[10]*xv;
1370         sum12 += v[11]*xv;
1371         sum13 += v[12]*xv;
1372         sum14 += v[13]*xv;
1373         sum15 += v[14]*xv;
1374         v     += 15;
1375       }
1376     }
1377     if (usecprow) z = zarray + 15*ridx[i];
1378     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1379     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1380 
1381     if (!usecprow) z += 15;
1382   }
1383 
1384   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1385   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1386   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1387   PetscFunctionReturn(0);
1388 }
1389 
1390 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1391 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1392 {
1393   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1394   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1395   const PetscScalar *x,*xb;
1396   PetscScalar       x1,x2,x3,x4,*zarray;
1397   const MatScalar   *v;
1398   PetscErrorCode    ierr;
1399   const PetscInt    *ii,*ij=a->j,*idx;
1400   PetscInt          mbs,i,j,n,*ridx=NULL;
1401   PetscBool         usecprow=a->compressedrow.use;
1402 
1403   PetscFunctionBegin;
1404   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1405   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1406 
1407   v = a->a;
1408   if (usecprow) {
1409     mbs  = a->compressedrow.nrows;
1410     ii   = a->compressedrow.i;
1411     ridx = a->compressedrow.rindex;
1412     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1413   } else {
1414     mbs = a->mbs;
1415     ii  = a->i;
1416     z   = zarray;
1417   }
1418 
1419   for (i=0; i<mbs; i++) {
1420     n    = ii[i+1] - ii[i];
1421     idx  = ij + ii[i];
1422     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1423     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1424 
1425     for (j=0; j<n; j++) {
1426       xb = x + 15*(idx[j]);
1427       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1428 
1429       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1430       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1431       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1432       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1433       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1434       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1435       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1436       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1437       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1438       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1439       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1440       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1441       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1442       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1443       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1444 
1445       v += 60;
1446 
1447       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1448 
1449       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1450       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1451       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1452       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1453       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1454       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1455       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1456       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1457       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1458       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1459       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1460       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1461       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1462       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1463       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1464       v     += 60;
1465 
1466       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1467       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1468       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1469       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1470       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1471       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1472       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1473       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1474       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1475       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1476       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1477       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1478       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1479       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1480       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1481       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1482       v     += 60;
1483 
1484       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1485       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1486       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1487       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1488       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1489       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1490       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1491       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1492       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1493       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1494       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1495       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1496       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1497       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1498       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1499       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1500       v     += 45;
1501     }
1502     if (usecprow) z = zarray + 15*ridx[i];
1503     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1504     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1505 
1506     if (!usecprow) z += 15;
1507   }
1508 
1509   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1510   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1511   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1512   PetscFunctionReturn(0);
1513 }
1514 
1515 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1516 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1517 {
1518   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1519   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1520   const PetscScalar *x,*xb;
1521   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1522   const MatScalar   *v;
1523   PetscErrorCode    ierr;
1524   const PetscInt    *ii,*ij=a->j,*idx;
1525   PetscInt          mbs,i,j,n,*ridx=NULL;
1526   PetscBool         usecprow=a->compressedrow.use;
1527 
1528   PetscFunctionBegin;
1529   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1530   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1531 
1532   v = a->a;
1533   if (usecprow) {
1534     mbs  = a->compressedrow.nrows;
1535     ii   = a->compressedrow.i;
1536     ridx = a->compressedrow.rindex;
1537     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1538   } else {
1539     mbs = a->mbs;
1540     ii  = a->i;
1541     z   = zarray;
1542   }
1543 
1544   for (i=0; i<mbs; i++) {
1545     n    = ii[i+1] - ii[i];
1546     idx  = ij + ii[i];
1547     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1548     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1549 
1550     for (j=0; j<n; j++) {
1551       xb = x + 15*(idx[j]);
1552       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1553       x8 = xb[7];
1554 
1555       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1556       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1557       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1558       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1559       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1560       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1561       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1562       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1563       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1564       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1565       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1566       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1567       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1568       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1569       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1570       v     += 120;
1571 
1572       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1573 
1574       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1575       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1576       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1577       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1578       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1579       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1580       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1581       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1582       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1583       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1584       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1585       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1586       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1587       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1588       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1589       v     += 105;
1590     }
1591     if (usecprow) z = zarray + 15*ridx[i];
1592     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1593     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1594 
1595     if (!usecprow) z += 15;
1596   }
1597 
1598   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1599   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1600   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1605 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1606 {
1607   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1608   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1609   const PetscScalar *x,*xb;
1610   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1611   const MatScalar   *v;
1612   PetscErrorCode    ierr;
1613   const PetscInt    *ii,*ij=a->j,*idx;
1614   PetscInt          mbs,i,j,n,*ridx=NULL;
1615   PetscBool         usecprow=a->compressedrow.use;
1616 
1617   PetscFunctionBegin;
1618   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1619   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1620 
1621   v = a->a;
1622   if (usecprow) {
1623     mbs  = a->compressedrow.nrows;
1624     ii   = a->compressedrow.i;
1625     ridx = a->compressedrow.rindex;
1626     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1627   } else {
1628     mbs = a->mbs;
1629     ii  = a->i;
1630     z   = zarray;
1631   }
1632 
1633   for (i=0; i<mbs; i++) {
1634     n    = ii[i+1] - ii[i];
1635     idx  = ij + ii[i];
1636     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1637     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1638 
1639     for (j=0; j<n; j++) {
1640       xb = x + 15*(idx[j]);
1641       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1642       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1643 
1644       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1645       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1646       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1647       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1648       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1649       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1650       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1651       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1652       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1653       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1654       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1655       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1656       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1657       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1658       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1659       v     += 225;
1660     }
1661     if (usecprow) z = zarray + 15*ridx[i];
1662     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1663     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1664 
1665     if (!usecprow) z += 15;
1666   }
1667 
1668   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1669   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1670   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1671   PetscFunctionReturn(0);
1672 }
1673 
1674 /*
1675     This will not work with MatScalar == float because it calls the BLAS
1676 */
1677 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1678 {
1679   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1680   PetscScalar       *z = NULL,*work,*workt,*zarray;
1681   const PetscScalar *x,*xb;
1682   const MatScalar   *v;
1683   PetscErrorCode    ierr;
1684   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1685   const PetscInt    *idx,*ii,*ridx=NULL;
1686   PetscInt          ncols,k;
1687   PetscBool         usecprow=a->compressedrow.use;
1688 
1689   PetscFunctionBegin;
1690   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1691   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1692 
1693   idx = a->j;
1694   v   = a->a;
1695   if (usecprow) {
1696     mbs  = a->compressedrow.nrows;
1697     ii   = a->compressedrow.i;
1698     ridx = a->compressedrow.rindex;
1699     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1700   } else {
1701     mbs = a->mbs;
1702     ii  = a->i;
1703     z   = zarray;
1704   }
1705 
1706   if (!a->mult_work) {
1707     k    = PetscMax(A->rmap->n,A->cmap->n);
1708     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1709   }
1710   work = a->mult_work;
1711   for (i=0; i<mbs; i++) {
1712     n           = ii[1] - ii[0]; ii++;
1713     ncols       = n*bs;
1714     workt       = work;
1715     for (j=0; j<n; j++) {
1716       xb = x + bs*(*idx++);
1717       for (k=0; k<bs; k++) workt[k] = xb[k];
1718       workt += bs;
1719     }
1720     if (usecprow) z = zarray + bs*ridx[i];
1721     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1722     v += n*bs2;
1723     if (!usecprow) z += bs;
1724   }
1725   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1726   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1727   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1728   PetscFunctionReturn(0);
1729 }
1730 
1731 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1732 {
1733   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1734   const PetscScalar *x;
1735   PetscScalar       *y,*z,sum;
1736   const MatScalar   *v;
1737   PetscErrorCode    ierr;
1738   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1739   const PetscInt    *idx,*ii;
1740   PetscBool         usecprow=a->compressedrow.use;
1741 
1742   PetscFunctionBegin;
1743   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1744   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1745 
1746   idx = a->j;
1747   v   = a->a;
1748   if (usecprow) {
1749     if (zz != yy) {
1750       ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr);
1751     }
1752     mbs  = a->compressedrow.nrows;
1753     ii   = a->compressedrow.i;
1754     ridx = a->compressedrow.rindex;
1755   } else {
1756     ii = a->i;
1757   }
1758 
1759   for (i=0; i<mbs; i++) {
1760     n = ii[1] - ii[0];
1761     ii++;
1762     if (!usecprow) {
1763       sum = y[i];
1764     } else {
1765       sum = y[ridx[i]];
1766     }
1767     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1768     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1769     PetscSparseDensePlusDot(sum,x,v,idx,n);
1770     v   += n;
1771     idx += n;
1772     if (usecprow) {
1773       z[ridx[i]] = sum;
1774     } else {
1775       z[i] = sum;
1776     }
1777   }
1778   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1779   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1780   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1781   PetscFunctionReturn(0);
1782 }
1783 
1784 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1785 {
1786   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1787   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1788   const PetscScalar *x,*xb;
1789   PetscScalar       x1,x2,*yarray,*zarray;
1790   const MatScalar   *v;
1791   PetscErrorCode    ierr;
1792   PetscInt          mbs = a->mbs,i,n,j;
1793   const PetscInt    *idx,*ii,*ridx = NULL;
1794   PetscBool         usecprow = a->compressedrow.use;
1795 
1796   PetscFunctionBegin;
1797   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1798   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1799 
1800   idx = a->j;
1801   v   = a->a;
1802   if (usecprow) {
1803     if (zz != yy) {
1804       ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr);
1805     }
1806     mbs  = a->compressedrow.nrows;
1807     ii   = a->compressedrow.i;
1808     ridx = a->compressedrow.rindex;
1809   } else {
1810     ii = a->i;
1811     y  = yarray;
1812     z  = zarray;
1813   }
1814 
1815   for (i=0; i<mbs; i++) {
1816     n = ii[1] - ii[0]; ii++;
1817     if (usecprow) {
1818       z = zarray + 2*ridx[i];
1819       y = yarray + 2*ridx[i];
1820     }
1821     sum1 = y[0]; sum2 = y[1];
1822     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1823     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1824     for (j=0; j<n; j++) {
1825       xb = x + 2*(*idx++);
1826       x1 = xb[0];
1827       x2 = xb[1];
1828 
1829       sum1 += v[0]*x1 + v[2]*x2;
1830       sum2 += v[1]*x1 + v[3]*x2;
1831       v    += 4;
1832     }
1833     z[0] = sum1; z[1] = sum2;
1834     if (!usecprow) {
1835       z += 2; y += 2;
1836     }
1837   }
1838   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1839   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1840   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1841   PetscFunctionReturn(0);
1842 }
1843 
1844 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1845 {
1846   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1847   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1848   const PetscScalar *x,*xb;
1849   const MatScalar   *v;
1850   PetscErrorCode    ierr;
1851   PetscInt          mbs = a->mbs,i,j,n;
1852   const PetscInt    *idx,*ii,*ridx = NULL;
1853   PetscBool         usecprow = a->compressedrow.use;
1854 
1855   PetscFunctionBegin;
1856   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1857   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1858 
1859   idx = a->j;
1860   v   = a->a;
1861   if (usecprow) {
1862     if (zz != yy) {
1863       ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr);
1864     }
1865     mbs  = a->compressedrow.nrows;
1866     ii   = a->compressedrow.i;
1867     ridx = a->compressedrow.rindex;
1868   } else {
1869     ii = a->i;
1870     y  = yarray;
1871     z  = zarray;
1872   }
1873 
1874   for (i=0; i<mbs; i++) {
1875     n = ii[1] - ii[0]; ii++;
1876     if (usecprow) {
1877       z = zarray + 3*ridx[i];
1878       y = yarray + 3*ridx[i];
1879     }
1880     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1881     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1882     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1883     for (j=0; j<n; j++) {
1884       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1885       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1886       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1887       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1888       v    += 9;
1889     }
1890     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1891     if (!usecprow) {
1892       z += 3; y += 3;
1893     }
1894   }
1895   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1896   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1897   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1898   PetscFunctionReturn(0);
1899 }
1900 
1901 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1902 {
1903   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1904   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1905   const PetscScalar *x,*xb;
1906   const MatScalar   *v;
1907   PetscErrorCode    ierr;
1908   PetscInt          mbs = a->mbs,i,j,n;
1909   const PetscInt    *idx,*ii,*ridx=NULL;
1910   PetscBool         usecprow=a->compressedrow.use;
1911 
1912   PetscFunctionBegin;
1913   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1914   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1915 
1916   idx = a->j;
1917   v   = a->a;
1918   if (usecprow) {
1919     if (zz != yy) {
1920       ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr);
1921     }
1922     mbs  = a->compressedrow.nrows;
1923     ii   = a->compressedrow.i;
1924     ridx = a->compressedrow.rindex;
1925   } else {
1926     ii = a->i;
1927     y  = yarray;
1928     z  = zarray;
1929   }
1930 
1931   for (i=0; i<mbs; i++) {
1932     n = ii[1] - ii[0]; ii++;
1933     if (usecprow) {
1934       z = zarray + 4*ridx[i];
1935       y = yarray + 4*ridx[i];
1936     }
1937     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1938     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1939     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1940     for (j=0; j<n; j++) {
1941       xb    = x + 4*(*idx++);
1942       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1943       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1944       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1945       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1946       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1947       v    += 16;
1948     }
1949     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1950     if (!usecprow) {
1951       z += 4; y += 4;
1952     }
1953   }
1954   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1955   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1956   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1957   PetscFunctionReturn(0);
1958 }
1959 
1960 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1961 {
1962   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1963   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1964   const PetscScalar *x,*xb;
1965   PetscScalar       *yarray,*zarray;
1966   const MatScalar   *v;
1967   PetscErrorCode    ierr;
1968   PetscInt          mbs = a->mbs,i,j,n;
1969   const PetscInt    *idx,*ii,*ridx = NULL;
1970   PetscBool         usecprow=a->compressedrow.use;
1971 
1972   PetscFunctionBegin;
1973   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1974   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1975 
1976   idx = a->j;
1977   v   = a->a;
1978   if (usecprow) {
1979     if (zz != yy) {
1980       ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr);
1981     }
1982     mbs  = a->compressedrow.nrows;
1983     ii   = a->compressedrow.i;
1984     ridx = a->compressedrow.rindex;
1985   } else {
1986     ii = a->i;
1987     y  = yarray;
1988     z  = zarray;
1989   }
1990 
1991   for (i=0; i<mbs; i++) {
1992     n = ii[1] - ii[0]; ii++;
1993     if (usecprow) {
1994       z = zarray + 5*ridx[i];
1995       y = yarray + 5*ridx[i];
1996     }
1997     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1998     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1999     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2000     for (j=0; j<n; j++) {
2001       xb    = x + 5*(*idx++);
2002       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2003       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2004       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2005       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2006       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2007       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2008       v    += 25;
2009     }
2010     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2011     if (!usecprow) {
2012       z += 5; y += 5;
2013     }
2014   }
2015   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2016   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2017   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
2018   PetscFunctionReturn(0);
2019 }
2020 
2021 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2022 {
2023   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2024   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2025   const PetscScalar *x,*xb;
2026   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2027   const MatScalar   *v;
2028   PetscErrorCode    ierr;
2029   PetscInt          mbs = a->mbs,i,j,n;
2030   const PetscInt    *idx,*ii,*ridx=NULL;
2031   PetscBool         usecprow=a->compressedrow.use;
2032 
2033   PetscFunctionBegin;
2034   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2035   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2036 
2037   idx = a->j;
2038   v   = a->a;
2039   if (usecprow) {
2040     if (zz != yy) {
2041       ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr);
2042     }
2043     mbs  = a->compressedrow.nrows;
2044     ii   = a->compressedrow.i;
2045     ridx = a->compressedrow.rindex;
2046   } else {
2047     ii = a->i;
2048     y  = yarray;
2049     z  = zarray;
2050   }
2051 
2052   for (i=0; i<mbs; i++) {
2053     n = ii[1] - ii[0]; ii++;
2054     if (usecprow) {
2055       z = zarray + 6*ridx[i];
2056       y = yarray + 6*ridx[i];
2057     }
2058     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2059     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2060     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2061     for (j=0; j<n; j++) {
2062       xb    = x + 6*(*idx++);
2063       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2064       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2065       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2066       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2067       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2068       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2069       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2070       v    += 36;
2071     }
2072     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2073     if (!usecprow) {
2074       z += 6; y += 6;
2075     }
2076   }
2077   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2078   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2079   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2080   PetscFunctionReturn(0);
2081 }
2082 
2083 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2084 {
2085   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2086   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2087   const PetscScalar *x,*xb;
2088   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2089   const MatScalar   *v;
2090   PetscErrorCode    ierr;
2091   PetscInt          mbs = a->mbs,i,j,n;
2092   const PetscInt    *idx,*ii,*ridx = NULL;
2093   PetscBool         usecprow=a->compressedrow.use;
2094 
2095   PetscFunctionBegin;
2096   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2097   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2098 
2099   idx = a->j;
2100   v   = a->a;
2101   if (usecprow) {
2102     if (zz != yy) {
2103       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
2104     }
2105     mbs  = a->compressedrow.nrows;
2106     ii   = a->compressedrow.i;
2107     ridx = a->compressedrow.rindex;
2108   } else {
2109     ii = a->i;
2110     y  = yarray;
2111     z  = zarray;
2112   }
2113 
2114   for (i=0; i<mbs; i++) {
2115     n = ii[1] - ii[0]; ii++;
2116     if (usecprow) {
2117       z = zarray + 7*ridx[i];
2118       y = yarray + 7*ridx[i];
2119     }
2120     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2121     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2122     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2123     for (j=0; j<n; j++) {
2124       xb    = x + 7*(*idx++);
2125       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2126       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2127       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2128       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2129       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2130       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2131       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2132       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2133       v    += 49;
2134     }
2135     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2136     if (!usecprow) {
2137       z += 7; y += 7;
2138     }
2139   }
2140   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2141   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2142   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2143   PetscFunctionReturn(0);
2144 }
2145 
2146 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2147 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2148 {
2149   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2150   PetscScalar       *z = NULL,*work,*workt,*zarray;
2151   const PetscScalar *x,*xb;
2152   const MatScalar   *v;
2153   PetscErrorCode    ierr;
2154   PetscInt          mbs,i,j,n;
2155   PetscInt          k;
2156   PetscBool         usecprow=a->compressedrow.use;
2157   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2158 
2159   __m256d a0,a1,a2,a3,a4,a5;
2160   __m256d w0,w1,w2,w3;
2161   __m256d z0,z1,z2;
2162   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2163 
2164   PetscFunctionBegin;
2165   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2166   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2167   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2168 
2169   idx = a->j;
2170   v   = a->a;
2171   if (usecprow) {
2172     mbs  = a->compressedrow.nrows;
2173     ii   = a->compressedrow.i;
2174     ridx = a->compressedrow.rindex;
2175   } else {
2176     mbs = a->mbs;
2177     ii  = a->i;
2178     z   = zarray;
2179   }
2180 
2181   if (!a->mult_work) {
2182     k    = PetscMax(A->rmap->n,A->cmap->n);
2183     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2184   }
2185 
2186   work = a->mult_work;
2187   for (i=0; i<mbs; i++) {
2188     n           = ii[1] - ii[0]; ii++;
2189     workt       = work;
2190     for (j=0; j<n; j++) {
2191       xb = x + bs*(*idx++);
2192       for (k=0; k<bs; k++) workt[k] = xb[k];
2193       workt += bs;
2194     }
2195     if (usecprow) z = zarray + bs*ridx[i];
2196 
2197     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2198 
2199     for (j=0; j<n; j++) {
2200       /* first column of a */
2201       w0 = _mm256_set1_pd(work[j*9  ]);
2202       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2203       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2204       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2205 
2206       /* second column of a */
2207       w1 = _mm256_set1_pd(work[j*9+ 1]);
2208       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2209       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2210       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2211 
2212       /* third column of a */
2213       w2 = _mm256_set1_pd(work[j*9+ 2]);
2214       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2215       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2216       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2217 
2218       /* fourth column of a */
2219       w3 = _mm256_set1_pd(work[j*9+ 3]);
2220       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2221       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2222       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2223 
2224       /* fifth column of a */
2225       w0 = _mm256_set1_pd(work[j*9+ 4]);
2226       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2227       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2228       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2229 
2230       /* sixth column of a */
2231       w1 = _mm256_set1_pd(work[j*9+ 5]);
2232       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2233       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2234       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2235 
2236       /* seventh column of a */
2237       w2 = _mm256_set1_pd(work[j*9+ 6]);
2238       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2239       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2240       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2241 
2242       /* eigth column of a */
2243       w3 = _mm256_set1_pd(work[j*9+ 7]);
2244       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2245       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2246       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2247 
2248       /* ninth column of a */
2249       w0 = _mm256_set1_pd(work[j*9+ 8]);
2250       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2251       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2252       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2253     }
2254 
2255     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2256 
2257     v += n*bs2;
2258     if (!usecprow) z += bs;
2259   }
2260   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2261   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2262   ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
2263   PetscFunctionReturn(0);
2264 }
2265 #endif
2266 
2267 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2268 {
2269   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2270   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2271   const PetscScalar *x,*xb;
2272   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2273   const MatScalar   *v;
2274   PetscErrorCode    ierr;
2275   PetscInt          mbs = a->mbs,i,j,n;
2276   const PetscInt    *idx,*ii,*ridx = NULL;
2277   PetscBool         usecprow=a->compressedrow.use;
2278 
2279   PetscFunctionBegin;
2280   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2281   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2282 
2283   idx = a->j;
2284   v   = a->a;
2285   if (usecprow) {
2286     if (zz != yy) {
2287       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
2288     }
2289     mbs  = a->compressedrow.nrows;
2290     ii   = a->compressedrow.i;
2291     ridx = a->compressedrow.rindex;
2292   } else {
2293     ii = a->i;
2294     y  = yarray;
2295     z  = zarray;
2296   }
2297 
2298   for (i=0; i<mbs; i++) {
2299     n = ii[1] - ii[0]; ii++;
2300     if (usecprow) {
2301       z = zarray + 11*ridx[i];
2302       y = yarray + 11*ridx[i];
2303     }
2304     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2305     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2306     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2307     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2308     for (j=0; j<n; j++) {
2309       xb    = x + 11*(*idx++);
2310       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2311       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2312       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2313       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2314       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2315       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2316       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2317       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2318       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2319       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2320       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2321       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2322       v    += 121;
2323     }
2324     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2325     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
2326     if (!usecprow) {
2327       z += 11; y += 11;
2328     }
2329   }
2330   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2331   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2332   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
2333   PetscFunctionReturn(0);
2334 }
2335 
2336 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2337 {
2338   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2339   PetscScalar       *z = NULL,*work,*workt,*zarray;
2340   const PetscScalar *x,*xb;
2341   const MatScalar   *v;
2342   PetscErrorCode    ierr;
2343   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2344   PetscInt          ncols,k;
2345   const PetscInt    *ridx = NULL,*idx,*ii;
2346   PetscBool         usecprow = a->compressedrow.use;
2347 
2348   PetscFunctionBegin;
2349   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2350   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2351   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2352 
2353   idx = a->j;
2354   v   = a->a;
2355   if (usecprow) {
2356     mbs  = a->compressedrow.nrows;
2357     ii   = a->compressedrow.i;
2358     ridx = a->compressedrow.rindex;
2359   } else {
2360     mbs = a->mbs;
2361     ii  = a->i;
2362     z   = zarray;
2363   }
2364 
2365   if (!a->mult_work) {
2366     k    = PetscMax(A->rmap->n,A->cmap->n);
2367     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2368   }
2369   work = a->mult_work;
2370   for (i=0; i<mbs; i++) {
2371     n     = ii[1] - ii[0]; ii++;
2372     ncols = n*bs;
2373     workt = work;
2374     for (j=0; j<n; j++) {
2375       xb = x + bs*(*idx++);
2376       for (k=0; k<bs; k++) workt[k] = xb[k];
2377       workt += bs;
2378     }
2379     if (usecprow) z = zarray + bs*ridx[i];
2380     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2381     v += n*bs2;
2382     if (!usecprow) z += bs;
2383   }
2384   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2385   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2386   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2387   PetscFunctionReturn(0);
2388 }
2389 
2390 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2391 {
2392   PetscScalar    zero = 0.0;
2393   PetscErrorCode ierr;
2394 
2395   PetscFunctionBegin;
2396   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2397   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2398   PetscFunctionReturn(0);
2399 }
2400 
2401 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2402 {
2403   PetscScalar    zero = 0.0;
2404   PetscErrorCode ierr;
2405 
2406   PetscFunctionBegin;
2407   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2408   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2409   PetscFunctionReturn(0);
2410 }
2411 
2412 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2413 {
2414   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2415   PetscScalar       *z,x1,x2,x3,x4,x5;
2416   const PetscScalar *x,*xb = NULL;
2417   const MatScalar   *v;
2418   PetscErrorCode    ierr;
2419   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2420   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2421   Mat_CompressedRow cprow = a->compressedrow;
2422   PetscBool         usecprow = cprow.use;
2423 
2424   PetscFunctionBegin;
2425   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2426   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2427   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2428 
2429   idx = a->j;
2430   v   = a->a;
2431   if (usecprow) {
2432     mbs  = cprow.nrows;
2433     ii   = cprow.i;
2434     ridx = cprow.rindex;
2435   } else {
2436     mbs=a->mbs;
2437     ii = a->i;
2438     xb = x;
2439   }
2440 
2441   switch (bs) {
2442   case 1:
2443     for (i=0; i<mbs; i++) {
2444       if (usecprow) xb = x + ridx[i];
2445       x1 = xb[0];
2446       ib = idx + ii[0];
2447       n  = ii[1] - ii[0]; ii++;
2448       for (j=0; j<n; j++) {
2449         rval     = ib[j];
2450         z[rval] += PetscConj(*v) * x1;
2451         v++;
2452       }
2453       if (!usecprow) xb++;
2454     }
2455     break;
2456   case 2:
2457     for (i=0; i<mbs; i++) {
2458       if (usecprow) xb = x + 2*ridx[i];
2459       x1 = xb[0]; x2 = xb[1];
2460       ib = idx + ii[0];
2461       n  = ii[1] - ii[0]; ii++;
2462       for (j=0; j<n; j++) {
2463         rval       = ib[j]*2;
2464         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2465         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2466         v         += 4;
2467       }
2468       if (!usecprow) xb += 2;
2469     }
2470     break;
2471   case 3:
2472     for (i=0; i<mbs; i++) {
2473       if (usecprow) xb = x + 3*ridx[i];
2474       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2475       ib = idx + ii[0];
2476       n  = ii[1] - ii[0]; ii++;
2477       for (j=0; j<n; j++) {
2478         rval       = ib[j]*3;
2479         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2480         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2481         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2482         v         += 9;
2483       }
2484       if (!usecprow) xb += 3;
2485     }
2486     break;
2487   case 4:
2488     for (i=0; i<mbs; i++) {
2489       if (usecprow) xb = x + 4*ridx[i];
2490       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2491       ib = idx + ii[0];
2492       n  = ii[1] - ii[0]; ii++;
2493       for (j=0; j<n; j++) {
2494         rval       = ib[j]*4;
2495         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2496         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2497         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2498         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2499         v         += 16;
2500       }
2501       if (!usecprow) xb += 4;
2502     }
2503     break;
2504   case 5:
2505     for (i=0; i<mbs; i++) {
2506       if (usecprow) xb = x + 5*ridx[i];
2507       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2508       x4 = xb[3]; x5 = xb[4];
2509       ib = idx + ii[0];
2510       n  = ii[1] - ii[0]; ii++;
2511       for (j=0; j<n; j++) {
2512         rval       = ib[j]*5;
2513         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2514         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2515         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2516         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2517         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2518         v         += 25;
2519       }
2520       if (!usecprow) xb += 5;
2521     }
2522     break;
2523   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2524     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2525 #if 0
2526     {
2527       PetscInt          ncols,k,bs2=a->bs2;
2528       PetscScalar       *work,*workt,zb;
2529       const PetscScalar *xtmp;
2530       if (!a->mult_work) {
2531         k    = PetscMax(A->rmap->n,A->cmap->n);
2532         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2533       }
2534       work = a->mult_work;
2535       xtmp = x;
2536       for (i=0; i<mbs; i++) {
2537         n     = ii[1] - ii[0]; ii++;
2538         ncols = n*bs;
2539         ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2540         if (usecprow) xtmp = x + bs*ridx[i];
2541         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2542         v += n*bs2;
2543         if (!usecprow) xtmp += bs;
2544         workt = work;
2545         for (j=0; j<n; j++) {
2546           zb = z + bs*(*idx++);
2547           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2548           workt += bs;
2549         }
2550       }
2551     }
2552 #endif
2553   }
2554   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2555   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2556   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2557   PetscFunctionReturn(0);
2558 }
2559 
2560 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2561 {
2562   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2563   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2564   const PetscScalar *x,*xb = NULL;
2565   const MatScalar   *v;
2566   PetscErrorCode    ierr;
2567   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2568   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2569   Mat_CompressedRow cprow   = a->compressedrow;
2570   PetscBool         usecprow=cprow.use;
2571 
2572   PetscFunctionBegin;
2573   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2574   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2575   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2576 
2577   idx = a->j;
2578   v   = a->a;
2579   if (usecprow) {
2580     mbs  = cprow.nrows;
2581     ii   = cprow.i;
2582     ridx = cprow.rindex;
2583   } else {
2584     mbs=a->mbs;
2585     ii = a->i;
2586     xb = x;
2587   }
2588 
2589   switch (bs) {
2590   case 1:
2591     for (i=0; i<mbs; i++) {
2592       if (usecprow) xb = x + ridx[i];
2593       x1 = xb[0];
2594       ib = idx + ii[0];
2595       n  = ii[1] - ii[0]; ii++;
2596       for (j=0; j<n; j++) {
2597         rval     = ib[j];
2598         z[rval] += *v * x1;
2599         v++;
2600       }
2601       if (!usecprow) xb++;
2602     }
2603     break;
2604   case 2:
2605     for (i=0; i<mbs; i++) {
2606       if (usecprow) xb = x + 2*ridx[i];
2607       x1 = xb[0]; x2 = xb[1];
2608       ib = idx + ii[0];
2609       n  = ii[1] - ii[0]; ii++;
2610       for (j=0; j<n; j++) {
2611         rval       = ib[j]*2;
2612         z[rval++] += v[0]*x1 + v[1]*x2;
2613         z[rval++] += v[2]*x1 + v[3]*x2;
2614         v         += 4;
2615       }
2616       if (!usecprow) xb += 2;
2617     }
2618     break;
2619   case 3:
2620     for (i=0; i<mbs; i++) {
2621       if (usecprow) xb = x + 3*ridx[i];
2622       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2623       ib = idx + ii[0];
2624       n  = ii[1] - ii[0]; ii++;
2625       for (j=0; j<n; j++) {
2626         rval       = ib[j]*3;
2627         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2628         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2629         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2630         v         += 9;
2631       }
2632       if (!usecprow) xb += 3;
2633     }
2634     break;
2635   case 4:
2636     for (i=0; i<mbs; i++) {
2637       if (usecprow) xb = x + 4*ridx[i];
2638       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2639       ib = idx + ii[0];
2640       n  = ii[1] - ii[0]; ii++;
2641       for (j=0; j<n; j++) {
2642         rval       = ib[j]*4;
2643         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2644         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2645         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2646         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2647         v         += 16;
2648       }
2649       if (!usecprow) xb += 4;
2650     }
2651     break;
2652   case 5:
2653     for (i=0; i<mbs; i++) {
2654       if (usecprow) xb = x + 5*ridx[i];
2655       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2656       x4 = xb[3]; x5 = xb[4];
2657       ib = idx + ii[0];
2658       n  = ii[1] - ii[0]; ii++;
2659       for (j=0; j<n; j++) {
2660         rval       = ib[j]*5;
2661         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2662         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2663         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2664         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2665         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2666         v         += 25;
2667       }
2668       if (!usecprow) xb += 5;
2669     }
2670     break;
2671   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2672     PetscInt          ncols,k;
2673     PetscScalar       *work,*workt;
2674     const PetscScalar *xtmp;
2675     if (!a->mult_work) {
2676       k    = PetscMax(A->rmap->n,A->cmap->n);
2677       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2678     }
2679     work = a->mult_work;
2680     xtmp = x;
2681     for (i=0; i<mbs; i++) {
2682       n     = ii[1] - ii[0]; ii++;
2683       ncols = n*bs;
2684       ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2685       if (usecprow) xtmp = x + bs*ridx[i];
2686       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2687       v += n*bs2;
2688       if (!usecprow) xtmp += bs;
2689       workt = work;
2690       for (j=0; j<n; j++) {
2691         zb = z + bs*(*idx++);
2692         for (k=0; k<bs; k++) zb[k] += workt[k];
2693         workt += bs;
2694       }
2695     }
2696     }
2697   }
2698   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2699   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2700   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2701   PetscFunctionReturn(0);
2702 }
2703 
2704 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2705 {
2706   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2707   PetscInt       totalnz = a->bs2*a->nz;
2708   PetscScalar    oalpha  = alpha;
2709   PetscErrorCode ierr;
2710   PetscBLASInt   one = 1,tnz;
2711 
2712   PetscFunctionBegin;
2713   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2714   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2715   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2716   PetscFunctionReturn(0);
2717 }
2718 
2719 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2720 {
2721   PetscErrorCode ierr;
2722   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2723   MatScalar      *v  = a->a;
2724   PetscReal      sum = 0.0;
2725   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2726 
2727   PetscFunctionBegin;
2728   if (type == NORM_FROBENIUS) {
2729 #if defined(PETSC_USE_REAL___FP16)
2730     PetscBLASInt one = 1,cnt = bs2*nz;
2731     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2732 #else
2733     for (i=0; i<bs2*nz; i++) {
2734       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2735     }
2736 #endif
2737     *norm = PetscSqrtReal(sum);
2738     ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr);
2739   } else if (type == NORM_1) { /* maximum column sum */
2740     PetscReal *tmp;
2741     PetscInt  *bcol = a->j;
2742     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2743     for (i=0; i<nz; i++) {
2744       for (j=0; j<bs; j++) {
2745         k1 = bs*(*bcol) + j; /* column index */
2746         for (k=0; k<bs; k++) {
2747           tmp[k1] += PetscAbsScalar(*v); v++;
2748         }
2749       }
2750       bcol++;
2751     }
2752     *norm = 0.0;
2753     for (j=0; j<A->cmap->n; j++) {
2754       if (tmp[j] > *norm) *norm = tmp[j];
2755     }
2756     ierr = PetscFree(tmp);CHKERRQ(ierr);
2757     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2758   } else if (type == NORM_INFINITY) { /* maximum row sum */
2759     *norm = 0.0;
2760     for (k=0; k<bs; k++) {
2761       for (j=0; j<a->mbs; j++) {
2762         v   = a->a + bs2*a->i[j] + k;
2763         sum = 0.0;
2764         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2765           for (k1=0; k1<bs; k1++) {
2766             sum += PetscAbsScalar(*v);
2767             v   += bs;
2768           }
2769         }
2770         if (sum > *norm) *norm = sum;
2771       }
2772     }
2773     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2774   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2775   PetscFunctionReturn(0);
2776 }
2777 
2778 
2779 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2780 {
2781   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2782   PetscErrorCode ierr;
2783 
2784   PetscFunctionBegin;
2785   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2786   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2787     *flg = PETSC_FALSE;
2788     PetscFunctionReturn(0);
2789   }
2790 
2791   /* if the a->i are the same */
2792   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
2793   if (!*flg) PetscFunctionReturn(0);
2794 
2795   /* if a->j are the same */
2796   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
2797   if (!*flg) PetscFunctionReturn(0);
2798 
2799   /* if a->a are the same */
2800   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr);
2801   PetscFunctionReturn(0);
2802 
2803 }
2804 
2805 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2806 {
2807   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2808   PetscErrorCode ierr;
2809   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2810   PetscScalar    *x,zero = 0.0;
2811   MatScalar      *aa,*aa_j;
2812 
2813   PetscFunctionBegin;
2814   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2815   bs   = A->rmap->bs;
2816   aa   = a->a;
2817   ai   = a->i;
2818   aj   = a->j;
2819   ambs = a->mbs;
2820   bs2  = a->bs2;
2821 
2822   ierr = VecSet(v,zero);CHKERRQ(ierr);
2823   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2824   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2825   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2826   for (i=0; i<ambs; i++) {
2827     for (j=ai[i]; j<ai[i+1]; j++) {
2828       if (aj[j] == i) {
2829         row  = i*bs;
2830         aa_j = aa+j*bs2;
2831         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2832         break;
2833       }
2834     }
2835   }
2836   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2837   PetscFunctionReturn(0);
2838 }
2839 
2840 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2841 {
2842   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2843   const PetscScalar *l,*r,*li,*ri;
2844   PetscScalar       x;
2845   MatScalar         *aa, *v;
2846   PetscErrorCode    ierr;
2847   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2848   const PetscInt    *ai,*aj;
2849 
2850   PetscFunctionBegin;
2851   ai  = a->i;
2852   aj  = a->j;
2853   aa  = a->a;
2854   m   = A->rmap->n;
2855   n   = A->cmap->n;
2856   bs  = A->rmap->bs;
2857   mbs = a->mbs;
2858   bs2 = a->bs2;
2859   if (ll) {
2860     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2861     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2862     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2863     for (i=0; i<mbs; i++) { /* for each block row */
2864       M  = ai[i+1] - ai[i];
2865       li = l + i*bs;
2866       v  = aa + bs2*ai[i];
2867       for (j=0; j<M; j++) { /* for each block */
2868         for (k=0; k<bs2; k++) {
2869           (*v++) *= li[k%bs];
2870         }
2871       }
2872     }
2873     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2874     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2875   }
2876 
2877   if (rr) {
2878     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2879     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2880     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2881     for (i=0; i<mbs; i++) { /* for each block row */
2882       iai = ai[i];
2883       M   = ai[i+1] - iai;
2884       v   = aa + bs2*iai;
2885       for (j=0; j<M; j++) { /* for each block */
2886         ri = r + bs*aj[iai+j];
2887         for (k=0; k<bs; k++) {
2888           x = ri[k];
2889           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2890           v += bs;
2891         }
2892       }
2893     }
2894     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2895     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2896   }
2897   PetscFunctionReturn(0);
2898 }
2899 
2900 
2901 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2902 {
2903   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2904 
2905   PetscFunctionBegin;
2906   info->block_size   = a->bs2;
2907   info->nz_allocated = a->bs2*a->maxnz;
2908   info->nz_used      = a->bs2*a->nz;
2909   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2910   info->assemblies   = A->num_ass;
2911   info->mallocs      = A->info.mallocs;
2912   info->memory       = ((PetscObject)A)->mem;
2913   if (A->factortype) {
2914     info->fill_ratio_given  = A->info.fill_ratio_given;
2915     info->fill_ratio_needed = A->info.fill_ratio_needed;
2916     info->factor_mallocs    = A->info.factor_mallocs;
2917   } else {
2918     info->fill_ratio_given  = 0;
2919     info->fill_ratio_needed = 0;
2920     info->factor_mallocs    = 0;
2921   }
2922   PetscFunctionReturn(0);
2923 }
2924 
2925 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2926 {
2927   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2928   PetscErrorCode ierr;
2929 
2930   PetscFunctionBegin;
2931   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
2932   PetscFunctionReturn(0);
2933 }
2934 
2935 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2936 {
2937   PetscErrorCode ierr;
2938 
2939   PetscFunctionBegin;
2940   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
2941   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2942   PetscFunctionReturn(0);
2943 }
2944 
2945 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2946 {
2947   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2948   PetscScalar       *z = NULL,sum1;
2949   const PetscScalar *xb;
2950   PetscScalar       x1;
2951   const MatScalar   *v,*vv;
2952   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2953   PetscBool         usecprow=a->compressedrow.use;
2954 
2955   PetscFunctionBegin;
2956   idx = a->j;
2957   v   = a->a;
2958   if (usecprow) {
2959     mbs  = a->compressedrow.nrows;
2960     ii   = a->compressedrow.i;
2961     ridx = a->compressedrow.rindex;
2962   } else {
2963     mbs = a->mbs;
2964     ii  = a->i;
2965     z   = c;
2966   }
2967 
2968   for (i=0; i<mbs; i++) {
2969     n           = ii[1] - ii[0]; ii++;
2970     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2971     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2972     if (usecprow) z = c + ridx[i];
2973     jj = idx;
2974     vv = v;
2975     for (k=0; k<cn; k++) {
2976       idx = jj;
2977       v = vv;
2978       sum1    = 0.0;
2979       for (j=0; j<n; j++) {
2980         xb    = b + (*idx++); x1 = xb[0+k*bm];
2981         sum1 += v[0]*x1;
2982         v    += 1;
2983       }
2984       z[0+k*cm] = sum1;
2985     }
2986     if (!usecprow) z += 1;
2987   }
2988   PetscFunctionReturn(0);
2989 }
2990 
2991 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2992 {
2993   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2994   PetscScalar       *z = NULL,sum1,sum2;
2995   const PetscScalar *xb;
2996   PetscScalar       x1,x2;
2997   const MatScalar   *v,*vv;
2998   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2999   PetscBool         usecprow=a->compressedrow.use;
3000 
3001   PetscFunctionBegin;
3002   idx = a->j;
3003   v   = a->a;
3004   if (usecprow) {
3005     mbs  = a->compressedrow.nrows;
3006     ii   = a->compressedrow.i;
3007     ridx = a->compressedrow.rindex;
3008   } else {
3009     mbs = a->mbs;
3010     ii  = a->i;
3011     z   = c;
3012   }
3013 
3014   for (i=0; i<mbs; i++) {
3015     n           = ii[1] - ii[0]; ii++;
3016     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3017     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3018     if (usecprow) z = c + 2*ridx[i];
3019     jj = idx;
3020     vv = v;
3021     for (k=0; k<cn; k++) {
3022       idx = jj;
3023       v = vv;
3024       sum1    = 0.0; sum2 = 0.0;
3025       for (j=0; j<n; j++) {
3026         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3027         sum1 += v[0]*x1 + v[2]*x2;
3028         sum2 += v[1]*x1 + v[3]*x2;
3029         v    += 4;
3030       }
3031       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3032     }
3033     if (!usecprow) z += 2;
3034   }
3035   PetscFunctionReturn(0);
3036 }
3037 
3038 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3039 {
3040   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3041   PetscScalar       *z = NULL,sum1,sum2,sum3;
3042   const PetscScalar *xb;
3043   PetscScalar       x1,x2,x3;
3044   const MatScalar   *v,*vv;
3045   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3046   PetscBool         usecprow=a->compressedrow.use;
3047 
3048   PetscFunctionBegin;
3049   idx = a->j;
3050   v   = a->a;
3051   if (usecprow) {
3052     mbs  = a->compressedrow.nrows;
3053     ii   = a->compressedrow.i;
3054     ridx = a->compressedrow.rindex;
3055   } else {
3056     mbs = a->mbs;
3057     ii  = a->i;
3058     z   = c;
3059   }
3060 
3061   for (i=0; i<mbs; i++) {
3062     n           = ii[1] - ii[0]; ii++;
3063     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3064     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3065     if (usecprow) z = c + 3*ridx[i];
3066     jj = idx;
3067     vv = v;
3068     for (k=0; k<cn; k++) {
3069       idx = jj;
3070       v = vv;
3071       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3072       for (j=0; j<n; j++) {
3073         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3074         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3075         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3076         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3077         v    += 9;
3078       }
3079       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3080     }
3081     if (!usecprow) z += 3;
3082   }
3083   PetscFunctionReturn(0);
3084 }
3085 
3086 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3087 {
3088   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3089   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3090   const PetscScalar *xb;
3091   PetscScalar       x1,x2,x3,x4;
3092   const MatScalar   *v,*vv;
3093   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3094   PetscBool         usecprow=a->compressedrow.use;
3095 
3096   PetscFunctionBegin;
3097   idx = a->j;
3098   v   = a->a;
3099   if (usecprow) {
3100     mbs  = a->compressedrow.nrows;
3101     ii   = a->compressedrow.i;
3102     ridx = a->compressedrow.rindex;
3103   } else {
3104     mbs = a->mbs;
3105     ii  = a->i;
3106     z   = c;
3107   }
3108 
3109   for (i=0; i<mbs; i++) {
3110     n           = ii[1] - ii[0]; ii++;
3111     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3112     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3113     if (usecprow) z = c + 4*ridx[i];
3114     jj = idx;
3115     vv = v;
3116     for (k=0; k<cn; k++) {
3117       idx = jj;
3118       v = vv;
3119       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3120       for (j=0; j<n; j++) {
3121         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3122         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3123         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3124         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3125         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3126         v    += 16;
3127       }
3128       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3129     }
3130     if (!usecprow) z += 4;
3131   }
3132   PetscFunctionReturn(0);
3133 }
3134 
3135 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3136 {
3137   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3138   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3139   const PetscScalar *xb;
3140   PetscScalar       x1,x2,x3,x4,x5;
3141   const MatScalar   *v,*vv;
3142   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3143   PetscBool         usecprow=a->compressedrow.use;
3144 
3145   PetscFunctionBegin;
3146   idx = a->j;
3147   v   = a->a;
3148   if (usecprow) {
3149     mbs  = a->compressedrow.nrows;
3150     ii   = a->compressedrow.i;
3151     ridx = a->compressedrow.rindex;
3152   } else {
3153     mbs = a->mbs;
3154     ii  = a->i;
3155     z   = c;
3156   }
3157 
3158   for (i=0; i<mbs; i++) {
3159     n           = ii[1] - ii[0]; ii++;
3160     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3161     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3162     if (usecprow) z = c + 5*ridx[i];
3163     jj = idx;
3164     vv = v;
3165     for (k=0; k<cn; k++) {
3166       idx = jj;
3167       v = vv;
3168       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3169       for (j=0; j<n; j++) {
3170         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3171         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3172         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3173         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3174         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3175         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3176         v    += 25;
3177       }
3178       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3179     }
3180     if (!usecprow) z += 5;
3181   }
3182   PetscFunctionReturn(0);
3183 }
3184 
3185 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3186 {
3187   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3188   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3189   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3190   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3191   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3192   PetscBLASInt      bbs,bcn,bbm,bcm;
3193   PetscScalar       *z = NULL;
3194   PetscScalar       *c,*b;
3195   const MatScalar   *v;
3196   const PetscInt    *idx,*ii,*ridx=NULL;
3197   PetscScalar       _DZero=0.0,_DOne=1.0;
3198   PetscBool         usecprow=a->compressedrow.use;
3199   PetscErrorCode    ierr;
3200 
3201   PetscFunctionBegin;
3202   if (!cm || !cn) PetscFunctionReturn(0);
3203   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3204   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3205   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3206   b = bd->v;
3207   if (a->nonzerorowcnt != A->rmap->n) {
3208     ierr = MatZeroEntries(C);CHKERRQ(ierr);
3209   }
3210   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
3211   switch (bs) {
3212   case 1:
3213     ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3214     break;
3215   case 2:
3216     ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3217     break;
3218   case 3:
3219     ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3220     break;
3221   case 4:
3222     ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3223     break;
3224   case 5:
3225     ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3226     break;
3227   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3228     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
3229     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
3230     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
3231     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
3232     idx = a->j;
3233     v   = a->a;
3234     if (usecprow) {
3235       mbs  = a->compressedrow.nrows;
3236       ii   = a->compressedrow.i;
3237       ridx = a->compressedrow.rindex;
3238     } else {
3239       mbs = a->mbs;
3240       ii  = a->i;
3241       z   = c;
3242     }
3243     for (i=0; i<mbs; i++) {
3244       n = ii[1] - ii[0]; ii++;
3245       if (usecprow) z = c + bs*ridx[i];
3246       if (n) {
3247         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3248         v += bs2;
3249       }
3250       for (j=1; j<n; j++) {
3251         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3252         v += bs2;
3253       }
3254       if (!usecprow) z += bs;
3255     }
3256   }
3257   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
3258   ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr);
3259   PetscFunctionReturn(0);
3260 }
3261