xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision dfd57a172ac9fa6c7b5fe6de6ab5df85cefc2996)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7 
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 #include <immintrin.h>
10 #endif
11 
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscBool      flag;
88 
89   PetscFunctionBegin;
90   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
91   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
93   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
94 
95   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
96   ssmap = smap;
97   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
98   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99   /* determine lens of each row */
100   for (i=0; i<nrows; i++) {
101     kstart  = ai[irow[i]];
102     kend    = kstart + a->ilen[irow[i]];
103     lens[i] = 0;
104     for (k=kstart; k<kend; k++) {
105       if (ssmap[aj[k]]) lens[i]++;
106     }
107   }
108   /* Create and fill new matrix */
109   if (scall == MAT_REUSE_MATRIX) {
110     c = (Mat_SeqBAIJ*)((*B)->data);
111 
112     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
114     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
116     C    = *B;
117   } else {
118     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
119     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
120     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
121     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
122   }
123   c = (Mat_SeqBAIJ*)(C->data);
124   for (i=0; i<nrows; i++) {
125     row      = irow[i];
126     kstart   = ai[row];
127     kend     = kstart + a->ilen[row];
128     mat_i    = c->i[i];
129     mat_j    = c->j + mat_i;
130     mat_a    = c->a + mat_i*bs2;
131     mat_ilen = c->ilen + i;
132     for (k=kstart; k<kend; k++) {
133       if ((tcol=ssmap[a->j[k]])) {
134         *mat_j++ = tcol - 1;
135         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
136         mat_a   += bs2;
137         (*mat_ilen)++;
138       }
139     }
140   }
141   /* sort */
142   {
143     MatScalar *work;
144     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
145     for (i=0; i<nrows; i++) {
146       PetscInt ilen;
147       mat_i = c->i[i];
148       mat_j = c->j + mat_i;
149       mat_a = c->a + mat_i*bs2;
150       ilen  = c->ilen[i];
151       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
152     }
153     ierr = PetscFree(work);CHKERRQ(ierr);
154   }
155 
156   /* Free work space */
157   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
158   ierr = PetscFree(smap);CHKERRQ(ierr);
159   ierr = PetscFree(lens);CHKERRQ(ierr);
160   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162 
163   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
164   *B   = C;
165   PetscFunctionReturn(0);
166 }
167 
168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169 {
170   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
171   IS             is1,is2;
172   PetscErrorCode ierr;
173   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174   const PetscInt *irow,*icol;
175 
176   PetscFunctionBegin;
177   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
178   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
179   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
180   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
181 
182   /* Verify if the indices corespond to each element in a block
183    and form the IS with compressed IS */
184   maxmnbs = PetscMax(a->mbs,a->nbs);
185   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
186   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
187   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190   }
191   count = 0;
192   for (i=0; i<nrows; i++) {
193     j = irow[i] / bs;
194     if ((vary[j]--)==bs) iary[count++] = j;
195   }
196   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
197 
198   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
199   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200   for (i=0; i<a->nbs; i++) {
201     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202   }
203   count = 0;
204   for (i=0; i<ncols; i++) {
205     j = icol[i] / bs;
206     if ((vary[j]--)==bs) iary[count++] = j;
207   }
208   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
209   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
211   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
212 
213   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
214   ierr = ISDestroy(&is1);CHKERRQ(ierr);
215   ierr = ISDestroy(&is2);CHKERRQ(ierr);
216   PetscFunctionReturn(0);
217 }
218 
219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220 {
221   PetscErrorCode ierr;
222   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
223   Mat_SubSppt    *submatj = c->submatis1;
224 
225   PetscFunctionBegin;
226   ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
227   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
230 
231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232 {
233   PetscErrorCode ierr;
234   PetscInt       i;
235   Mat            C;
236   Mat_SeqBAIJ    *c;
237   Mat_SubSppt    *submatj;
238 
239   PetscFunctionBegin;
240   for (i=0; i<n; i++) {
241     C       = (*mat)[i];
242     c       = (Mat_SeqBAIJ*)C->data;
243     submatj = c->submatis1;
244     if (submatj) {
245       if (--((PetscObject)C)->refct <= 0) {
246         ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
247         ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
248         ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
249         ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
250         ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
251         ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
252       }
253     } else {
254       ierr = MatDestroy(&C);CHKERRQ(ierr);
255     }
256   }
257 
258   /* Destroy Dummy submatrices created for reuse */
259   ierr = MatDestroySubMatrices_Dummy(n,mat);CHKERRQ(ierr);
260 
261   ierr = PetscFree(*mat);CHKERRQ(ierr);
262   PetscFunctionReturn(0);
263 }
264 
265 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
266 {
267   PetscErrorCode ierr;
268   PetscInt       i;
269 
270   PetscFunctionBegin;
271   if (scall == MAT_INITIAL_MATRIX) {
272     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
273   }
274 
275   for (i=0; i<n; i++) {
276     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
277   }
278   PetscFunctionReturn(0);
279 }
280 
281 /* -------------------------------------------------------*/
282 /* Should check that shapes of vectors and matrices match */
283 /* -------------------------------------------------------*/
284 
285 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
286 {
287   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
288   PetscScalar       *z,sum;
289   const PetscScalar *x;
290   const MatScalar   *v;
291   PetscErrorCode    ierr;
292   PetscInt          mbs,i,n;
293   const PetscInt    *idx,*ii,*ridx=NULL;
294   PetscBool         usecprow=a->compressedrow.use;
295 
296   PetscFunctionBegin;
297   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
298   ierr = VecGetArrayWrite(zz,&z);CHKERRQ(ierr);
299 
300   if (usecprow) {
301     mbs  = a->compressedrow.nrows;
302     ii   = a->compressedrow.i;
303     ridx = a->compressedrow.rindex;
304     ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr);
305   } else {
306     mbs = a->mbs;
307     ii  = a->i;
308   }
309 
310   for (i=0; i<mbs; i++) {
311     n   = ii[1] - ii[0];
312     v   = a->a + ii[0];
313     idx = a->j + ii[0];
314     ii++;
315     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
316     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
317     sum = 0.0;
318     PetscSparseDensePlusDot(sum,x,v,idx,n);
319     if (usecprow) {
320       z[ridx[i]] = sum;
321     } else {
322       z[i]       = sum;
323     }
324   }
325   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
326   ierr = VecRestoreArrayWrite(zz,&z);CHKERRQ(ierr);
327   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
328   PetscFunctionReturn(0);
329 }
330 
331 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
332 {
333   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
334   PetscScalar       *z = NULL,sum1,sum2,*zarray;
335   const PetscScalar *x,*xb;
336   PetscScalar       x1,x2;
337   const MatScalar   *v;
338   PetscErrorCode    ierr;
339   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
340   PetscBool         usecprow=a->compressedrow.use;
341 
342   PetscFunctionBegin;
343   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
344   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
345 
346   idx = a->j;
347   v   = a->a;
348   if (usecprow) {
349     mbs  = a->compressedrow.nrows;
350     ii   = a->compressedrow.i;
351     ridx = a->compressedrow.rindex;
352     ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr);
353   } else {
354     mbs = a->mbs;
355     ii  = a->i;
356     z   = zarray;
357   }
358 
359   for (i=0; i<mbs; i++) {
360     n           = ii[1] - ii[0]; ii++;
361     sum1        = 0.0; sum2 = 0.0;
362     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
363     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
364     for (j=0; j<n; j++) {
365       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
366       sum1 += v[0]*x1 + v[2]*x2;
367       sum2 += v[1]*x1 + v[3]*x2;
368       v    += 4;
369     }
370     if (usecprow) z = zarray + 2*ridx[i];
371     z[0] = sum1; z[1] = sum2;
372     if (!usecprow) z += 2;
373   }
374   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
375   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
376   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
381 {
382   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
383   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
384   const PetscScalar *x,*xb;
385   const MatScalar   *v;
386   PetscErrorCode    ierr;
387   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
388   PetscBool         usecprow=a->compressedrow.use;
389 
390 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
391 #pragma disjoint(*v,*z,*xb)
392 #endif
393 
394   PetscFunctionBegin;
395   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
396   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
397 
398   idx = a->j;
399   v   = a->a;
400   if (usecprow) {
401     mbs  = a->compressedrow.nrows;
402     ii   = a->compressedrow.i;
403     ridx = a->compressedrow.rindex;
404     ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr);
405   } else {
406     mbs = a->mbs;
407     ii  = a->i;
408     z   = zarray;
409   }
410 
411   for (i=0; i<mbs; i++) {
412     n           = ii[1] - ii[0]; ii++;
413     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
414     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
415     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
416     for (j=0; j<n; j++) {
417       xb = x + 3*(*idx++);
418       x1 = xb[0];
419       x2 = xb[1];
420       x3 = xb[2];
421 
422       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
423       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
424       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
425       v    += 9;
426     }
427     if (usecprow) z = zarray + 3*ridx[i];
428     z[0] = sum1; z[1] = sum2; z[2] = sum3;
429     if (!usecprow) z += 3;
430   }
431   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
432   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
433   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
434   PetscFunctionReturn(0);
435 }
436 
437 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
438 {
439   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
440   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
441   const PetscScalar *x,*xb;
442   const MatScalar   *v;
443   PetscErrorCode    ierr;
444   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
445   PetscBool         usecprow=a->compressedrow.use;
446 
447   PetscFunctionBegin;
448   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
449   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
450 
451   idx = a->j;
452   v   = a->a;
453   if (usecprow) {
454     mbs  = a->compressedrow.nrows;
455     ii   = a->compressedrow.i;
456     ridx = a->compressedrow.rindex;
457     ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr);
458   } else {
459     mbs = a->mbs;
460     ii  = a->i;
461     z   = zarray;
462   }
463 
464   for (i=0; i<mbs; i++) {
465     n = ii[1] - ii[0];
466     ii++;
467     sum1 = 0.0;
468     sum2 = 0.0;
469     sum3 = 0.0;
470     sum4 = 0.0;
471 
472     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
473     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
474     for (j=0; j<n; j++) {
475       xb    = x + 4*(*idx++);
476       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
477       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
478       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
479       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
480       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
481       v    += 16;
482     }
483     if (usecprow) z = zarray + 4*ridx[i];
484     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
485     if (!usecprow) z += 4;
486   }
487   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
488   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
489   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
490   PetscFunctionReturn(0);
491 }
492 
493 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
494 {
495   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
496   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
497   const PetscScalar *xb,*x;
498   const MatScalar   *v;
499   PetscErrorCode    ierr;
500   const PetscInt    *idx,*ii,*ridx=NULL;
501   PetscInt          mbs,i,j,n;
502   PetscBool         usecprow=a->compressedrow.use;
503 
504   PetscFunctionBegin;
505   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
506   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
507 
508   idx = a->j;
509   v   = a->a;
510   if (usecprow) {
511     mbs  = a->compressedrow.nrows;
512     ii   = a->compressedrow.i;
513     ridx = a->compressedrow.rindex;
514     ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr);
515   } else {
516     mbs = a->mbs;
517     ii  = a->i;
518     z   = zarray;
519   }
520 
521   for (i=0; i<mbs; i++) {
522     n           = ii[1] - ii[0]; ii++;
523     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
524     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
525     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
526     for (j=0; j<n; j++) {
527       xb    = x + 5*(*idx++);
528       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
529       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
530       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
531       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
532       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
533       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
534       v    += 25;
535     }
536     if (usecprow) z = zarray + 5*ridx[i];
537     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
538     if (!usecprow) z += 5;
539   }
540   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
541   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
542   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 
546 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
547 {
548   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
549   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
550   const PetscScalar *x,*xb;
551   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
552   const MatScalar   *v;
553   PetscErrorCode    ierr;
554   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
555   PetscBool         usecprow=a->compressedrow.use;
556 
557   PetscFunctionBegin;
558   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
559   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
560 
561   idx = a->j;
562   v   = a->a;
563   if (usecprow) {
564     mbs  = a->compressedrow.nrows;
565     ii   = a->compressedrow.i;
566     ridx = a->compressedrow.rindex;
567     ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr);
568   } else {
569     mbs = a->mbs;
570     ii  = a->i;
571     z   = zarray;
572   }
573 
574   for (i=0; i<mbs; i++) {
575     n  = ii[1] - ii[0];
576     ii++;
577     sum1 = 0.0;
578     sum2 = 0.0;
579     sum3 = 0.0;
580     sum4 = 0.0;
581     sum5 = 0.0;
582     sum6 = 0.0;
583 
584     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
585     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
586     for (j=0; j<n; j++) {
587       xb    = x + 6*(*idx++);
588       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
589       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
590       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
591       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
592       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
593       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
594       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
595       v    += 36;
596     }
597     if (usecprow) z = zarray + 6*ridx[i];
598     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
599     if (!usecprow) z += 6;
600   }
601 
602   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
603   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
604   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
605   PetscFunctionReturn(0);
606 }
607 
608 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
609 {
610   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
611   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
612   const PetscScalar *x,*xb;
613   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
614   const MatScalar   *v;
615   PetscErrorCode    ierr;
616   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
617   PetscBool         usecprow=a->compressedrow.use;
618 
619   PetscFunctionBegin;
620   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
621   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
622 
623   idx = a->j;
624   v   = a->a;
625   if (usecprow) {
626     mbs  = a->compressedrow.nrows;
627     ii   = a->compressedrow.i;
628     ridx = a->compressedrow.rindex;
629     ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr);
630   } else {
631     mbs = a->mbs;
632     ii  = a->i;
633     z   = zarray;
634   }
635 
636   for (i=0; i<mbs; i++) {
637     n  = ii[1] - ii[0];
638     ii++;
639     sum1 = 0.0;
640     sum2 = 0.0;
641     sum3 = 0.0;
642     sum4 = 0.0;
643     sum5 = 0.0;
644     sum6 = 0.0;
645     sum7 = 0.0;
646 
647     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
648     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
649     for (j=0; j<n; j++) {
650       xb    = x + 7*(*idx++);
651       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
652       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
653       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
654       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
655       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
656       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
657       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
658       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
659       v    += 49;
660     }
661     if (usecprow) z = zarray + 7*ridx[i];
662     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
663     if (!usecprow) z += 7;
664   }
665 
666   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
667   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
668   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
669   PetscFunctionReturn(0);
670 }
671 
672 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
673 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
674 {
675   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
676   PetscScalar       *z = NULL,*work,*workt,*zarray;
677   const PetscScalar *x,*xb;
678   const MatScalar   *v;
679   PetscErrorCode    ierr;
680   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
681   const PetscInt    *idx,*ii,*ridx=NULL;
682   PetscInt          k;
683   PetscBool         usecprow=a->compressedrow.use;
684 
685   __m256d a0,a1,a2,a3,a4,a5;
686   __m256d w0,w1,w2,w3;
687   __m256d z0,z1,z2;
688   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
689 
690   PetscFunctionBegin;
691   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
692   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
693 
694   idx = a->j;
695   v   = a->a;
696   if (usecprow) {
697     mbs  = a->compressedrow.nrows;
698     ii   = a->compressedrow.i;
699     ridx = a->compressedrow.rindex;
700     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
701   } else {
702     mbs = a->mbs;
703     ii  = a->i;
704     z   = zarray;
705   }
706 
707   if (!a->mult_work) {
708     k    = PetscMax(A->rmap->n,A->cmap->n);
709     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
710   }
711 
712   work = a->mult_work;
713   for (i=0; i<mbs; i++) {
714     n           = ii[1] - ii[0]; ii++;
715     workt       = work;
716     for (j=0; j<n; j++) {
717       xb = x + bs*(*idx++);
718       for (k=0; k<bs; k++) workt[k] = xb[k];
719       workt += bs;
720     }
721     if (usecprow) z = zarray + bs*ridx[i];
722 
723     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
724 
725     for (j=0; j<n; j++) {
726       /* first column of a */
727       w0 = _mm256_set1_pd(work[j*9  ]);
728       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
729       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
730       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
731 
732       /* second column of a */
733       w1 = _mm256_set1_pd(work[j*9+ 1]);
734       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
735       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
736       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
737 
738       /* third column of a */
739       w2 = _mm256_set1_pd(work[j*9 +2]);
740       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
741       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
742       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
743 
744       /* fourth column of a */
745       w3 = _mm256_set1_pd(work[j*9+ 3]);
746       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
747       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
748       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
749 
750       /* fifth column of a */
751       w0 = _mm256_set1_pd(work[j*9+ 4]);
752       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
753       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
754       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
755 
756       /* sixth column of a */
757       w1 = _mm256_set1_pd(work[j*9+ 5]);
758       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
759       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
760       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
761 
762       /* seventh column of a */
763       w2 = _mm256_set1_pd(work[j*9+ 6]);
764       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
765       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
766       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
767 
768       /* eigth column of a */
769       w3 = _mm256_set1_pd(work[j*9+ 7]);
770       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
771       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
772       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
773 
774       /* ninth column of a */
775       w0 = _mm256_set1_pd(work[j*9+ 8]);
776       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
777       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
778       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
779     }
780 
781     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
782 
783     v += n*bs2;
784     if (!usecprow) z += bs;
785   }
786   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
787   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #endif
792 
793 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
794 {
795   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
796   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
797   const PetscScalar *x,*xb;
798   PetscScalar       *zarray,xv;
799   const MatScalar   *v;
800   PetscErrorCode    ierr;
801   const PetscInt    *ii,*ij=a->j,*idx;
802   PetscInt          mbs,i,j,k,n,*ridx=NULL;
803   PetscBool         usecprow=a->compressedrow.use;
804 
805   PetscFunctionBegin;
806   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
807   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
808 
809   v = a->a;
810   if (usecprow) {
811     mbs  = a->compressedrow.nrows;
812     ii   = a->compressedrow.i;
813     ridx = a->compressedrow.rindex;
814     ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr);
815   } else {
816     mbs = a->mbs;
817     ii  = a->i;
818     z   = zarray;
819   }
820 
821   for (i=0; i<mbs; i++) {
822     n    = ii[i+1] - ii[i];
823     idx  = ij + ii[i];
824     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
825     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
826 
827     for (j=0; j<n; j++) {
828       xb = x + 11*(idx[j]);
829 
830       for (k=0; k<11; k++) {
831         xv     =  xb[k];
832         sum1  += v[0]*xv;
833         sum2  += v[1]*xv;
834         sum3  += v[2]*xv;
835         sum4  += v[3]*xv;
836         sum5  += v[4]*xv;
837         sum6  += v[5]*xv;
838         sum7  += v[6]*xv;
839         sum8  += v[7]*xv;
840         sum9  += v[8]*xv;
841         sum10 += v[9]*xv;
842         sum11 += v[10]*xv;
843         v     += 11;
844       }
845     }
846     if (usecprow) z = zarray + 11*ridx[i];
847     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
848     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
849 
850     if (!usecprow) z += 11;
851   }
852 
853   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
854   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
855   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
856   PetscFunctionReturn(0);
857 }
858 
859 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
860 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
861 {
862   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
863   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
864   const PetscScalar *x,*xb;
865   PetscScalar       *zarray,xv;
866   const MatScalar   *v;
867   PetscErrorCode    ierr;
868   const PetscInt    *ii,*ij=a->j,*idx;
869   PetscInt          mbs,i,j,k,n,*ridx=NULL;
870   PetscBool         usecprow=a->compressedrow.use;
871 
872   PetscFunctionBegin;
873   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
874   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
875 
876   v = a->a;
877   if (usecprow) {
878     mbs  = a->compressedrow.nrows;
879     ii   = a->compressedrow.i;
880     ridx = a->compressedrow.rindex;
881     ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr);
882   } else {
883     mbs = a->mbs;
884     ii  = a->i;
885     z   = zarray;
886   }
887 
888   for (i=0; i<mbs; i++) {
889     n    = ii[i+1] - ii[i];
890     idx  = ij + ii[i];
891     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
892     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
893 
894     for (j=0; j<n; j++) {
895       xb = x + 12*(idx[j]);
896 
897       for (k=0; k<12; k++) {
898         xv     =  xb[k];
899         sum1  += v[0]*xv;
900         sum2  += v[1]*xv;
901         sum3  += v[2]*xv;
902         sum4  += v[3]*xv;
903         sum5  += v[4]*xv;
904         sum6  += v[5]*xv;
905         sum7  += v[6]*xv;
906         sum8  += v[7]*xv;
907         sum9  += v[8]*xv;
908         sum10 += v[9]*xv;
909         sum11 += v[10]*xv;
910         sum12 += v[11]*xv;
911         v     += 12;
912       }
913     }
914     if (usecprow) z = zarray + 12*ridx[i];
915     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
916     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
917     if (!usecprow) z += 12;
918   }
919   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
920   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
921   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
922   PetscFunctionReturn(0);
923 }
924 
925 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
926 {
927   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
928   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
929   const PetscScalar *x,*xb;
930   PetscScalar       *zarray,*yarray,xv;
931   const MatScalar   *v;
932   PetscErrorCode    ierr;
933   const PetscInt    *ii,*ij=a->j,*idx;
934   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
935   PetscBool         usecprow=a->compressedrow.use;
936 
937   PetscFunctionBegin;
938   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
939   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
940 
941   v = a->a;
942   if (usecprow) {
943    if (zz != yy) {
944      ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr);
945     }
946     mbs  = a->compressedrow.nrows;
947     ii   = a->compressedrow.i;
948     ridx = a->compressedrow.rindex;
949   } else {
950     ii  = a->i;
951     y   = yarray;
952     z   = zarray;
953   }
954 
955   for (i=0; i<mbs; i++) {
956     n    = ii[i+1] - ii[i];
957     idx  = ij + ii[i];
958 
959     if (usecprow) {
960       y = yarray + 12*ridx[i];
961       z = zarray + 12*ridx[i];
962     }
963     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
964     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
965 
966     for (j=0; j<n; j++) {
967       xb = x + 12*(idx[j]);
968 
969       for (k=0; k<12; k++) {
970         xv     =  xb[k];
971         sum1  += v[0]*xv;
972         sum2  += v[1]*xv;
973         sum3  += v[2]*xv;
974         sum4  += v[3]*xv;
975         sum5  += v[4]*xv;
976         sum6  += v[5]*xv;
977         sum7  += v[6]*xv;
978         sum8  += v[7]*xv;
979         sum9  += v[8]*xv;
980         sum10 += v[9]*xv;
981         sum11 += v[10]*xv;
982         sum12 += v[11]*xv;
983         v     += 12;
984       }
985     }
986 
987     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
988     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
989     if (!usecprow) {
990       y += 12;
991       z += 12;
992     }
993   }
994   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
995   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
996   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
997   PetscFunctionReturn(0);
998 }
999 
1000 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1001 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
1002 {
1003   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1004   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1005   const PetscScalar *x,*xb;
1006   PetscScalar       x1,x2,x3,x4,*zarray;
1007   const MatScalar   *v;
1008   PetscErrorCode    ierr;
1009   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1010   PetscInt          mbs,i,j,n;
1011   PetscBool         usecprow=a->compressedrow.use;
1012 
1013   PetscFunctionBegin;
1014   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1015   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1016 
1017   v = a->a;
1018   if (usecprow) {
1019     mbs  = a->compressedrow.nrows;
1020     ii   = a->compressedrow.i;
1021     ridx = a->compressedrow.rindex;
1022     ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr);
1023   } else {
1024     mbs = a->mbs;
1025     ii  = a->i;
1026     z   = zarray;
1027   }
1028 
1029   for (i=0; i<mbs; i++) {
1030     n    = ii[i+1] - ii[i];
1031     idx  = ij + ii[i];
1032 
1033     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1034     for (j=0; j<n; j++) {
1035       xb = x + 12*(idx[j]);
1036       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1037 
1038       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1039       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1040       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1041       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1042       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1043       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1044       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1045       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1046       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1047       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1048       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1049       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1050       v += 48;
1051 
1052       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1053 
1054       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1055       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1056       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1057       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1058       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1059       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1060       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1061       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1062       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1063       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1064       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1065       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1066       v     += 48;
1067 
1068       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1069       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1070       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1071       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1072       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1073       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1074       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1075       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1076       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1077       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1078       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1079       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1080       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1081       v     += 48;
1082 
1083     }
1084     if (usecprow) z = zarray + 12*ridx[i];
1085     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1086     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1087     if (!usecprow) z += 12;
1088   }
1089   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1090   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1091   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1096 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1097 {
1098   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1099   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1100   const PetscScalar *x,*xb;
1101   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1102   const MatScalar   *v;
1103   PetscErrorCode    ierr;
1104   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1105   PetscInt          mbs = a->mbs,i,j,n;
1106   PetscBool         usecprow=a->compressedrow.use;
1107 
1108   PetscFunctionBegin;
1109   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1110   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1111 
1112   v = a->a;
1113   if (usecprow) {
1114     if (zz != yy) {
1115       ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr);
1116     }
1117     mbs  = a->compressedrow.nrows;
1118     ii   = a->compressedrow.i;
1119     ridx = a->compressedrow.rindex;
1120   } else {
1121     ii  = a->i;
1122     y   = yarray;
1123     z   = zarray;
1124   }
1125 
1126   for (i=0; i<mbs; i++) {
1127     n    = ii[i+1] - ii[i];
1128     idx  = ij + ii[i];
1129 
1130     if (usecprow) {
1131       y = yarray + 12*ridx[i];
1132       z = zarray + 12*ridx[i];
1133     }
1134     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1135     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1136 
1137     for (j=0; j<n; j++) {
1138       xb = x + 12*(idx[j]);
1139       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1140 
1141       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1142       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1143       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1144       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1145       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1146       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1147       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1148       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1149       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1150       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1151       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1152       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1153       v += 48;
1154 
1155       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1156 
1157       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1158       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1159       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1160       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1161       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1162       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1163       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1164       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1165       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1166       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1167       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1168       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1169       v     += 48;
1170 
1171       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1172       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1173       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1174       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1175       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1176       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1177       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1178       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1179       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1180       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1181       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1182       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1183       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1184       v     += 48;
1185 
1186     }
1187     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1188     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1189     if (!usecprow) {
1190       y += 12;
1191       z += 12;
1192     }
1193   }
1194   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1195   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1196   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
1197   PetscFunctionReturn(0);
1198 }
1199 
1200 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1201 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1202 {
1203   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1204   PetscScalar       *z = NULL,*zarray;
1205   const PetscScalar *x,*work;
1206   const MatScalar   *v = a->a;
1207   PetscErrorCode    ierr;
1208   PetscInt          mbs,i,j,n;
1209   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1210   PetscBool         usecprow=a->compressedrow.use;
1211   const PetscInt    bs = 12, bs2 = 144;
1212 
1213   __m256d a0,a1,a2,a3,a4,a5;
1214   __m256d w0,w1,w2,w3;
1215   __m256d z0,z1,z2;
1216 
1217   PetscFunctionBegin;
1218   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1219   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1220 
1221   if (usecprow) {
1222     mbs  = a->compressedrow.nrows;
1223     ii   = a->compressedrow.i;
1224     ridx = a->compressedrow.rindex;
1225     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1226   } else {
1227     mbs = a->mbs;
1228     ii  = a->i;
1229     z   = zarray;
1230   }
1231 
1232   for (i=0; i<mbs; i++) {
1233     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1234 
1235     n  = ii[1] - ii[0]; ii++;
1236     for (j=0; j<n; j++) {
1237       work = x + bs*(*idx++);
1238 
1239       /* first column of a */
1240       w0 = _mm256_set1_pd(work[0]);
1241       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1242       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1243       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1244 
1245       /* second column of a */
1246       w1 = _mm256_set1_pd(work[1]);
1247       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1248       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1249       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1250 
1251       /* third column of a */
1252       w2 = _mm256_set1_pd(work[2]);
1253       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1254       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1255       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1256 
1257       /* fourth column of a */
1258       w3 = _mm256_set1_pd(work[3]);
1259       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1260       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1261       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1262 
1263       /* fifth column of a */
1264       w0 = _mm256_set1_pd(work[4]);
1265       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1266       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1267       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1268 
1269       /* sixth column of a */
1270       w1 = _mm256_set1_pd(work[5]);
1271       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1272       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1273       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1274 
1275       /* seventh column of a */
1276       w2 = _mm256_set1_pd(work[6]);
1277       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1278       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1279       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1280 
1281       /* eigth column of a */
1282       w3 = _mm256_set1_pd(work[7]);
1283       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1284       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1285       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1286 
1287       /* ninth column of a */
1288       w0 = _mm256_set1_pd(work[8]);
1289       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1290       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1291       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1292 
1293       /* tenth column of a */
1294       w1 = _mm256_set1_pd(work[9]);
1295       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1296       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1297       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1298 
1299       /* eleventh column of a */
1300       w2 = _mm256_set1_pd(work[10]);
1301       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1302       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1303       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1304 
1305       /* twelveth column of a */
1306       w3 = _mm256_set1_pd(work[11]);
1307       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1308       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1309       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1310 
1311       v += bs2;
1312     }
1313     if (usecprow) z = zarray + bs*ridx[i];
1314     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1315     if (!usecprow) z += bs;
1316   }
1317   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1318   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1319   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1320   PetscFunctionReturn(0);
1321 }
1322 #endif
1323 
1324 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1325 /* Default MatMult for block size 15 */
1326 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1327 {
1328   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1329   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1330   const PetscScalar *x,*xb;
1331   PetscScalar       *zarray,xv;
1332   const MatScalar   *v;
1333   PetscErrorCode    ierr;
1334   const PetscInt    *ii,*ij=a->j,*idx;
1335   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1336   PetscBool         usecprow=a->compressedrow.use;
1337 
1338   PetscFunctionBegin;
1339   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1340   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1341 
1342   v = a->a;
1343   if (usecprow) {
1344     mbs  = a->compressedrow.nrows;
1345     ii   = a->compressedrow.i;
1346     ridx = a->compressedrow.rindex;
1347     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1348   } else {
1349     mbs = a->mbs;
1350     ii  = a->i;
1351     z   = zarray;
1352   }
1353 
1354   for (i=0; i<mbs; i++) {
1355     n    = ii[i+1] - ii[i];
1356     idx  = ij + ii[i];
1357     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1358     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1359 
1360     for (j=0; j<n; j++) {
1361       xb = x + 15*(idx[j]);
1362 
1363       for (k=0; k<15; k++) {
1364         xv     =  xb[k];
1365         sum1  += v[0]*xv;
1366         sum2  += v[1]*xv;
1367         sum3  += v[2]*xv;
1368         sum4  += v[3]*xv;
1369         sum5  += v[4]*xv;
1370         sum6  += v[5]*xv;
1371         sum7  += v[6]*xv;
1372         sum8  += v[7]*xv;
1373         sum9  += v[8]*xv;
1374         sum10 += v[9]*xv;
1375         sum11 += v[10]*xv;
1376         sum12 += v[11]*xv;
1377         sum13 += v[12]*xv;
1378         sum14 += v[13]*xv;
1379         sum15 += v[14]*xv;
1380         v     += 15;
1381       }
1382     }
1383     if (usecprow) z = zarray + 15*ridx[i];
1384     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1385     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1386 
1387     if (!usecprow) z += 15;
1388   }
1389 
1390   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1391   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1392   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1393   PetscFunctionReturn(0);
1394 }
1395 
1396 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1397 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1398 {
1399   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1400   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1401   const PetscScalar *x,*xb;
1402   PetscScalar       x1,x2,x3,x4,*zarray;
1403   const MatScalar   *v;
1404   PetscErrorCode    ierr;
1405   const PetscInt    *ii,*ij=a->j,*idx;
1406   PetscInt          mbs,i,j,n,*ridx=NULL;
1407   PetscBool         usecprow=a->compressedrow.use;
1408 
1409   PetscFunctionBegin;
1410   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1411   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1412 
1413   v = a->a;
1414   if (usecprow) {
1415     mbs  = a->compressedrow.nrows;
1416     ii   = a->compressedrow.i;
1417     ridx = a->compressedrow.rindex;
1418     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1419   } else {
1420     mbs = a->mbs;
1421     ii  = a->i;
1422     z   = zarray;
1423   }
1424 
1425   for (i=0; i<mbs; i++) {
1426     n    = ii[i+1] - ii[i];
1427     idx  = ij + ii[i];
1428     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1429     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1430 
1431     for (j=0; j<n; j++) {
1432       xb = x + 15*(idx[j]);
1433       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1434 
1435       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1436       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1437       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1438       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1439       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1440       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1441       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1442       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1443       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1444       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1445       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1446       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1447       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1448       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1449       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1450 
1451       v += 60;
1452 
1453       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1454 
1455       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1456       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1457       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1458       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1459       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1460       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1461       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1462       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1463       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1464       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1465       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1466       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1467       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1468       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1469       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1470       v     += 60;
1471 
1472       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1473       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1474       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1475       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1476       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1477       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1478       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1479       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1480       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1481       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1482       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1483       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1484       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1485       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1486       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1487       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1488       v     += 60;
1489 
1490       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1491       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1492       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1493       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1494       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1495       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1496       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1497       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1498       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1499       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1500       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1501       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1502       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1503       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1504       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1505       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1506       v     += 45;
1507     }
1508     if (usecprow) z = zarray + 15*ridx[i];
1509     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1510     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1511 
1512     if (!usecprow) z += 15;
1513   }
1514 
1515   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1516   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1517   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1518   PetscFunctionReturn(0);
1519 }
1520 
1521 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1522 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1523 {
1524   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1525   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1526   const PetscScalar *x,*xb;
1527   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1528   const MatScalar   *v;
1529   PetscErrorCode    ierr;
1530   const PetscInt    *ii,*ij=a->j,*idx;
1531   PetscInt          mbs,i,j,n,*ridx=NULL;
1532   PetscBool         usecprow=a->compressedrow.use;
1533 
1534   PetscFunctionBegin;
1535   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1536   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1537 
1538   v = a->a;
1539   if (usecprow) {
1540     mbs  = a->compressedrow.nrows;
1541     ii   = a->compressedrow.i;
1542     ridx = a->compressedrow.rindex;
1543     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1544   } else {
1545     mbs = a->mbs;
1546     ii  = a->i;
1547     z   = zarray;
1548   }
1549 
1550   for (i=0; i<mbs; i++) {
1551     n    = ii[i+1] - ii[i];
1552     idx  = ij + ii[i];
1553     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1554     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1555 
1556     for (j=0; j<n; j++) {
1557       xb = x + 15*(idx[j]);
1558       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1559       x8 = xb[7];
1560 
1561       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1562       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1563       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1564       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1565       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1566       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1567       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1568       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1569       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1570       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1571       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1572       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1573       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1574       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1575       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1576       v     += 120;
1577 
1578       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1579 
1580       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1581       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1582       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1583       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1584       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1585       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1586       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1587       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1588       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1589       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1590       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1591       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1592       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1593       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1594       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1595       v     += 105;
1596     }
1597     if (usecprow) z = zarray + 15*ridx[i];
1598     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1599     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1600 
1601     if (!usecprow) z += 15;
1602   }
1603 
1604   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1605   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1606   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1607   PetscFunctionReturn(0);
1608 }
1609 
1610 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1611 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1612 {
1613   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1614   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1615   const PetscScalar *x,*xb;
1616   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1617   const MatScalar   *v;
1618   PetscErrorCode    ierr;
1619   const PetscInt    *ii,*ij=a->j,*idx;
1620   PetscInt          mbs,i,j,n,*ridx=NULL;
1621   PetscBool         usecprow=a->compressedrow.use;
1622 
1623   PetscFunctionBegin;
1624   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1625   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1626 
1627   v = a->a;
1628   if (usecprow) {
1629     mbs  = a->compressedrow.nrows;
1630     ii   = a->compressedrow.i;
1631     ridx = a->compressedrow.rindex;
1632     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1633   } else {
1634     mbs = a->mbs;
1635     ii  = a->i;
1636     z   = zarray;
1637   }
1638 
1639   for (i=0; i<mbs; i++) {
1640     n    = ii[i+1] - ii[i];
1641     idx  = ij + ii[i];
1642     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1643     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1644 
1645     for (j=0; j<n; j++) {
1646       xb = x + 15*(idx[j]);
1647       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1648       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1649 
1650       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1651       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1652       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1653       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1654       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1655       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1656       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1657       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1658       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1659       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1660       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1661       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1662       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1663       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1664       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1665       v     += 225;
1666     }
1667     if (usecprow) z = zarray + 15*ridx[i];
1668     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1669     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1670 
1671     if (!usecprow) z += 15;
1672   }
1673 
1674   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1675   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1676   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1677   PetscFunctionReturn(0);
1678 }
1679 
1680 /*
1681     This will not work with MatScalar == float because it calls the BLAS
1682 */
1683 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1684 {
1685   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1686   PetscScalar       *z = NULL,*work,*workt,*zarray;
1687   const PetscScalar *x,*xb;
1688   const MatScalar   *v;
1689   PetscErrorCode    ierr;
1690   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1691   const PetscInt    *idx,*ii,*ridx=NULL;
1692   PetscInt          ncols,k;
1693   PetscBool         usecprow=a->compressedrow.use;
1694 
1695   PetscFunctionBegin;
1696   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1697   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1698 
1699   idx = a->j;
1700   v   = a->a;
1701   if (usecprow) {
1702     mbs  = a->compressedrow.nrows;
1703     ii   = a->compressedrow.i;
1704     ridx = a->compressedrow.rindex;
1705     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1706   } else {
1707     mbs = a->mbs;
1708     ii  = a->i;
1709     z   = zarray;
1710   }
1711 
1712   if (!a->mult_work) {
1713     k    = PetscMax(A->rmap->n,A->cmap->n);
1714     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1715   }
1716   work = a->mult_work;
1717   for (i=0; i<mbs; i++) {
1718     n           = ii[1] - ii[0]; ii++;
1719     ncols       = n*bs;
1720     workt       = work;
1721     for (j=0; j<n; j++) {
1722       xb = x + bs*(*idx++);
1723       for (k=0; k<bs; k++) workt[k] = xb[k];
1724       workt += bs;
1725     }
1726     if (usecprow) z = zarray + bs*ridx[i];
1727     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1728     v += n*bs2;
1729     if (!usecprow) z += bs;
1730   }
1731   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1732   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1733   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736 
1737 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1738 {
1739   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1740   const PetscScalar *x;
1741   PetscScalar       *y,*z,sum;
1742   const MatScalar   *v;
1743   PetscErrorCode    ierr;
1744   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1745   const PetscInt    *idx,*ii;
1746   PetscBool         usecprow=a->compressedrow.use;
1747 
1748   PetscFunctionBegin;
1749   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1750   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1751 
1752   idx = a->j;
1753   v   = a->a;
1754   if (usecprow) {
1755     if (zz != yy) {
1756       ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr);
1757     }
1758     mbs  = a->compressedrow.nrows;
1759     ii   = a->compressedrow.i;
1760     ridx = a->compressedrow.rindex;
1761   } else {
1762     ii = a->i;
1763   }
1764 
1765   for (i=0; i<mbs; i++) {
1766     n = ii[1] - ii[0];
1767     ii++;
1768     if (!usecprow) {
1769       sum = y[i];
1770     } else {
1771       sum = y[ridx[i]];
1772     }
1773     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1774     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1775     PetscSparseDensePlusDot(sum,x,v,idx,n);
1776     v   += n;
1777     idx += n;
1778     if (usecprow) {
1779       z[ridx[i]] = sum;
1780     } else {
1781       z[i] = sum;
1782     }
1783   }
1784   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1785   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1786   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1787   PetscFunctionReturn(0);
1788 }
1789 
1790 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1791 {
1792   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1793   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1794   const PetscScalar *x,*xb;
1795   PetscScalar       x1,x2,*yarray,*zarray;
1796   const MatScalar   *v;
1797   PetscErrorCode    ierr;
1798   PetscInt          mbs = a->mbs,i,n,j;
1799   const PetscInt    *idx,*ii,*ridx = NULL;
1800   PetscBool         usecprow = a->compressedrow.use;
1801 
1802   PetscFunctionBegin;
1803   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1804   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1805 
1806   idx = a->j;
1807   v   = a->a;
1808   if (usecprow) {
1809     if (zz != yy) {
1810       ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr);
1811     }
1812     mbs  = a->compressedrow.nrows;
1813     ii   = a->compressedrow.i;
1814     ridx = a->compressedrow.rindex;
1815   } else {
1816     ii = a->i;
1817     y  = yarray;
1818     z  = zarray;
1819   }
1820 
1821   for (i=0; i<mbs; i++) {
1822     n = ii[1] - ii[0]; ii++;
1823     if (usecprow) {
1824       z = zarray + 2*ridx[i];
1825       y = yarray + 2*ridx[i];
1826     }
1827     sum1 = y[0]; sum2 = y[1];
1828     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1829     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1830     for (j=0; j<n; j++) {
1831       xb = x + 2*(*idx++);
1832       x1 = xb[0];
1833       x2 = xb[1];
1834 
1835       sum1 += v[0]*x1 + v[2]*x2;
1836       sum2 += v[1]*x1 + v[3]*x2;
1837       v    += 4;
1838     }
1839     z[0] = sum1; z[1] = sum2;
1840     if (!usecprow) {
1841       z += 2; y += 2;
1842     }
1843   }
1844   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1845   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1846   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1847   PetscFunctionReturn(0);
1848 }
1849 
1850 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1851 {
1852   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1853   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1854   const PetscScalar *x,*xb;
1855   const MatScalar   *v;
1856   PetscErrorCode    ierr;
1857   PetscInt          mbs = a->mbs,i,j,n;
1858   const PetscInt    *idx,*ii,*ridx = NULL;
1859   PetscBool         usecprow = a->compressedrow.use;
1860 
1861   PetscFunctionBegin;
1862   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1863   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1864 
1865   idx = a->j;
1866   v   = a->a;
1867   if (usecprow) {
1868     if (zz != yy) {
1869       ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr);
1870     }
1871     mbs  = a->compressedrow.nrows;
1872     ii   = a->compressedrow.i;
1873     ridx = a->compressedrow.rindex;
1874   } else {
1875     ii = a->i;
1876     y  = yarray;
1877     z  = zarray;
1878   }
1879 
1880   for (i=0; i<mbs; i++) {
1881     n = ii[1] - ii[0]; ii++;
1882     if (usecprow) {
1883       z = zarray + 3*ridx[i];
1884       y = yarray + 3*ridx[i];
1885     }
1886     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1887     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1888     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1889     for (j=0; j<n; j++) {
1890       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1891       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1892       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1893       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1894       v    += 9;
1895     }
1896     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1897     if (!usecprow) {
1898       z += 3; y += 3;
1899     }
1900   }
1901   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1902   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1903   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1908 {
1909   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1910   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1911   const PetscScalar *x,*xb;
1912   const MatScalar   *v;
1913   PetscErrorCode    ierr;
1914   PetscInt          mbs = a->mbs,i,j,n;
1915   const PetscInt    *idx,*ii,*ridx=NULL;
1916   PetscBool         usecprow=a->compressedrow.use;
1917 
1918   PetscFunctionBegin;
1919   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1920   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1921 
1922   idx = a->j;
1923   v   = a->a;
1924   if (usecprow) {
1925     if (zz != yy) {
1926       ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr);
1927     }
1928     mbs  = a->compressedrow.nrows;
1929     ii   = a->compressedrow.i;
1930     ridx = a->compressedrow.rindex;
1931   } else {
1932     ii = a->i;
1933     y  = yarray;
1934     z  = zarray;
1935   }
1936 
1937   for (i=0; i<mbs; i++) {
1938     n = ii[1] - ii[0]; ii++;
1939     if (usecprow) {
1940       z = zarray + 4*ridx[i];
1941       y = yarray + 4*ridx[i];
1942     }
1943     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1944     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1945     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1946     for (j=0; j<n; j++) {
1947       xb    = x + 4*(*idx++);
1948       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1949       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1950       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1951       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1952       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1953       v    += 16;
1954     }
1955     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1956     if (!usecprow) {
1957       z += 4; y += 4;
1958     }
1959   }
1960   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1961   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1962   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1963   PetscFunctionReturn(0);
1964 }
1965 
1966 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1967 {
1968   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1969   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1970   const PetscScalar *x,*xb;
1971   PetscScalar       *yarray,*zarray;
1972   const MatScalar   *v;
1973   PetscErrorCode    ierr;
1974   PetscInt          mbs = a->mbs,i,j,n;
1975   const PetscInt    *idx,*ii,*ridx = NULL;
1976   PetscBool         usecprow=a->compressedrow.use;
1977 
1978   PetscFunctionBegin;
1979   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1980   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1981 
1982   idx = a->j;
1983   v   = a->a;
1984   if (usecprow) {
1985     if (zz != yy) {
1986       ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr);
1987     }
1988     mbs  = a->compressedrow.nrows;
1989     ii   = a->compressedrow.i;
1990     ridx = a->compressedrow.rindex;
1991   } else {
1992     ii = a->i;
1993     y  = yarray;
1994     z  = zarray;
1995   }
1996 
1997   for (i=0; i<mbs; i++) {
1998     n = ii[1] - ii[0]; ii++;
1999     if (usecprow) {
2000       z = zarray + 5*ridx[i];
2001       y = yarray + 5*ridx[i];
2002     }
2003     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
2004     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2005     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2006     for (j=0; j<n; j++) {
2007       xb    = x + 5*(*idx++);
2008       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2009       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2010       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2011       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2012       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2013       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2014       v    += 25;
2015     }
2016     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2017     if (!usecprow) {
2018       z += 5; y += 5;
2019     }
2020   }
2021   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2022   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2023   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
2024   PetscFunctionReturn(0);
2025 }
2026 
2027 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2028 {
2029   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2030   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2031   const PetscScalar *x,*xb;
2032   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2033   const MatScalar   *v;
2034   PetscErrorCode    ierr;
2035   PetscInt          mbs = a->mbs,i,j,n;
2036   const PetscInt    *idx,*ii,*ridx=NULL;
2037   PetscBool         usecprow=a->compressedrow.use;
2038 
2039   PetscFunctionBegin;
2040   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2041   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2042 
2043   idx = a->j;
2044   v   = a->a;
2045   if (usecprow) {
2046     if (zz != yy) {
2047       ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr);
2048     }
2049     mbs  = a->compressedrow.nrows;
2050     ii   = a->compressedrow.i;
2051     ridx = a->compressedrow.rindex;
2052   } else {
2053     ii = a->i;
2054     y  = yarray;
2055     z  = zarray;
2056   }
2057 
2058   for (i=0; i<mbs; i++) {
2059     n = ii[1] - ii[0]; ii++;
2060     if (usecprow) {
2061       z = zarray + 6*ridx[i];
2062       y = yarray + 6*ridx[i];
2063     }
2064     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2065     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2066     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2067     for (j=0; j<n; j++) {
2068       xb    = x + 6*(*idx++);
2069       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2070       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2071       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2072       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2073       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2074       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2075       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2076       v    += 36;
2077     }
2078     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2079     if (!usecprow) {
2080       z += 6; y += 6;
2081     }
2082   }
2083   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2084   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2085   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2086   PetscFunctionReturn(0);
2087 }
2088 
2089 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2090 {
2091   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2092   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2093   const PetscScalar *x,*xb;
2094   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2095   const MatScalar   *v;
2096   PetscErrorCode    ierr;
2097   PetscInt          mbs = a->mbs,i,j,n;
2098   const PetscInt    *idx,*ii,*ridx = NULL;
2099   PetscBool         usecprow=a->compressedrow.use;
2100 
2101   PetscFunctionBegin;
2102   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2103   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2104 
2105   idx = a->j;
2106   v   = a->a;
2107   if (usecprow) {
2108     if (zz != yy) {
2109       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
2110     }
2111     mbs  = a->compressedrow.nrows;
2112     ii   = a->compressedrow.i;
2113     ridx = a->compressedrow.rindex;
2114   } else {
2115     ii = a->i;
2116     y  = yarray;
2117     z  = zarray;
2118   }
2119 
2120   for (i=0; i<mbs; i++) {
2121     n = ii[1] - ii[0]; ii++;
2122     if (usecprow) {
2123       z = zarray + 7*ridx[i];
2124       y = yarray + 7*ridx[i];
2125     }
2126     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2127     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2128     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2129     for (j=0; j<n; j++) {
2130       xb    = x + 7*(*idx++);
2131       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2132       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2133       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2134       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2135       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2136       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2137       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2138       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2139       v    += 49;
2140     }
2141     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2142     if (!usecprow) {
2143       z += 7; y += 7;
2144     }
2145   }
2146   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2147   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2148   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2149   PetscFunctionReturn(0);
2150 }
2151 
2152 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2153 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2154 {
2155   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2156   PetscScalar       *z = NULL,*work,*workt,*zarray;
2157   const PetscScalar *x,*xb;
2158   const MatScalar   *v;
2159   PetscErrorCode    ierr;
2160   PetscInt          mbs,i,j,n;
2161   PetscInt          k;
2162   PetscBool         usecprow=a->compressedrow.use;
2163   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2164 
2165   __m256d a0,a1,a2,a3,a4,a5;
2166   __m256d w0,w1,w2,w3;
2167   __m256d z0,z1,z2;
2168   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2169 
2170   PetscFunctionBegin;
2171   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2172   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2173   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2174 
2175   idx = a->j;
2176   v   = a->a;
2177   if (usecprow) {
2178     mbs  = a->compressedrow.nrows;
2179     ii   = a->compressedrow.i;
2180     ridx = a->compressedrow.rindex;
2181   } else {
2182     mbs = a->mbs;
2183     ii  = a->i;
2184     z   = zarray;
2185   }
2186 
2187   if (!a->mult_work) {
2188     k    = PetscMax(A->rmap->n,A->cmap->n);
2189     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2190   }
2191 
2192   work = a->mult_work;
2193   for (i=0; i<mbs; i++) {
2194     n           = ii[1] - ii[0]; ii++;
2195     workt       = work;
2196     for (j=0; j<n; j++) {
2197       xb = x + bs*(*idx++);
2198       for (k=0; k<bs; k++) workt[k] = xb[k];
2199       workt += bs;
2200     }
2201     if (usecprow) z = zarray + bs*ridx[i];
2202 
2203     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2204 
2205     for (j=0; j<n; j++) {
2206       /* first column of a */
2207       w0 = _mm256_set1_pd(work[j*9  ]);
2208       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2209       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2210       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2211 
2212       /* second column of a */
2213       w1 = _mm256_set1_pd(work[j*9+ 1]);
2214       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2215       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2216       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2217 
2218       /* third column of a */
2219       w2 = _mm256_set1_pd(work[j*9+ 2]);
2220       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2221       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2222       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2223 
2224       /* fourth column of a */
2225       w3 = _mm256_set1_pd(work[j*9+ 3]);
2226       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2227       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2228       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2229 
2230       /* fifth column of a */
2231       w0 = _mm256_set1_pd(work[j*9+ 4]);
2232       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2233       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2234       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2235 
2236       /* sixth column of a */
2237       w1 = _mm256_set1_pd(work[j*9+ 5]);
2238       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2239       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2240       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2241 
2242       /* seventh column of a */
2243       w2 = _mm256_set1_pd(work[j*9+ 6]);
2244       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2245       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2246       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2247 
2248       /* eigth column of a */
2249       w3 = _mm256_set1_pd(work[j*9+ 7]);
2250       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2251       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2252       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2253 
2254       /* ninth column of a */
2255       w0 = _mm256_set1_pd(work[j*9+ 8]);
2256       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2257       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2258       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2259     }
2260 
2261     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2262 
2263     v += n*bs2;
2264     if (!usecprow) z += bs;
2265   }
2266   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2267   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2268   ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
2269   PetscFunctionReturn(0);
2270 }
2271 #endif
2272 
2273 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2274 {
2275   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2276   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2277   const PetscScalar *x,*xb;
2278   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2279   const MatScalar   *v;
2280   PetscErrorCode    ierr;
2281   PetscInt          mbs = a->mbs,i,j,n;
2282   const PetscInt    *idx,*ii,*ridx = NULL;
2283   PetscBool         usecprow=a->compressedrow.use;
2284 
2285   PetscFunctionBegin;
2286   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2287   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2288 
2289   idx = a->j;
2290   v   = a->a;
2291   if (usecprow) {
2292     if (zz != yy) {
2293       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
2294     }
2295     mbs  = a->compressedrow.nrows;
2296     ii   = a->compressedrow.i;
2297     ridx = a->compressedrow.rindex;
2298   } else {
2299     ii = a->i;
2300     y  = yarray;
2301     z  = zarray;
2302   }
2303 
2304   for (i=0; i<mbs; i++) {
2305     n = ii[1] - ii[0]; ii++;
2306     if (usecprow) {
2307       z = zarray + 11*ridx[i];
2308       y = yarray + 11*ridx[i];
2309     }
2310     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2311     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2312     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2313     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2314     for (j=0; j<n; j++) {
2315       xb    = x + 11*(*idx++);
2316       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2317       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2318       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2319       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2320       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2321       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2322       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2323       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2324       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2325       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2326       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2327       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2328       v    += 121;
2329     }
2330     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2331     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
2332     if (!usecprow) {
2333       z += 11; y += 11;
2334     }
2335   }
2336   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2337   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2338   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
2339   PetscFunctionReturn(0);
2340 }
2341 
2342 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2343 {
2344   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2345   PetscScalar       *z = NULL,*work,*workt,*zarray;
2346   const PetscScalar *x,*xb;
2347   const MatScalar   *v;
2348   PetscErrorCode    ierr;
2349   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2350   PetscInt          ncols,k;
2351   const PetscInt    *ridx = NULL,*idx,*ii;
2352   PetscBool         usecprow = a->compressedrow.use;
2353 
2354   PetscFunctionBegin;
2355   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2356   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2357   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2358 
2359   idx = a->j;
2360   v   = a->a;
2361   if (usecprow) {
2362     mbs  = a->compressedrow.nrows;
2363     ii   = a->compressedrow.i;
2364     ridx = a->compressedrow.rindex;
2365   } else {
2366     mbs = a->mbs;
2367     ii  = a->i;
2368     z   = zarray;
2369   }
2370 
2371   if (!a->mult_work) {
2372     k    = PetscMax(A->rmap->n,A->cmap->n);
2373     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2374   }
2375   work = a->mult_work;
2376   for (i=0; i<mbs; i++) {
2377     n     = ii[1] - ii[0]; ii++;
2378     ncols = n*bs;
2379     workt = work;
2380     for (j=0; j<n; j++) {
2381       xb = x + bs*(*idx++);
2382       for (k=0; k<bs; k++) workt[k] = xb[k];
2383       workt += bs;
2384     }
2385     if (usecprow) z = zarray + bs*ridx[i];
2386     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2387     v += n*bs2;
2388     if (!usecprow) z += bs;
2389   }
2390   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2391   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2392   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2393   PetscFunctionReturn(0);
2394 }
2395 
2396 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2397 {
2398   PetscScalar    zero = 0.0;
2399   PetscErrorCode ierr;
2400 
2401   PetscFunctionBegin;
2402   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2403   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2404   PetscFunctionReturn(0);
2405 }
2406 
2407 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2408 {
2409   PetscScalar    zero = 0.0;
2410   PetscErrorCode ierr;
2411 
2412   PetscFunctionBegin;
2413   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2414   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2415   PetscFunctionReturn(0);
2416 }
2417 
2418 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2419 {
2420   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2421   PetscScalar       *z,x1,x2,x3,x4,x5;
2422   const PetscScalar *x,*xb = NULL;
2423   const MatScalar   *v;
2424   PetscErrorCode    ierr;
2425   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2426   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2427   Mat_CompressedRow cprow = a->compressedrow;
2428   PetscBool         usecprow = cprow.use;
2429 
2430   PetscFunctionBegin;
2431   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2432   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2433   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2434 
2435   idx = a->j;
2436   v   = a->a;
2437   if (usecprow) {
2438     mbs  = cprow.nrows;
2439     ii   = cprow.i;
2440     ridx = cprow.rindex;
2441   } else {
2442     mbs=a->mbs;
2443     ii = a->i;
2444     xb = x;
2445   }
2446 
2447   switch (bs) {
2448   case 1:
2449     for (i=0; i<mbs; i++) {
2450       if (usecprow) xb = x + ridx[i];
2451       x1 = xb[0];
2452       ib = idx + ii[0];
2453       n  = ii[1] - ii[0]; ii++;
2454       for (j=0; j<n; j++) {
2455         rval     = ib[j];
2456         z[rval] += PetscConj(*v) * x1;
2457         v++;
2458       }
2459       if (!usecprow) xb++;
2460     }
2461     break;
2462   case 2:
2463     for (i=0; i<mbs; i++) {
2464       if (usecprow) xb = x + 2*ridx[i];
2465       x1 = xb[0]; x2 = xb[1];
2466       ib = idx + ii[0];
2467       n  = ii[1] - ii[0]; ii++;
2468       for (j=0; j<n; j++) {
2469         rval       = ib[j]*2;
2470         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2471         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2472         v         += 4;
2473       }
2474       if (!usecprow) xb += 2;
2475     }
2476     break;
2477   case 3:
2478     for (i=0; i<mbs; i++) {
2479       if (usecprow) xb = x + 3*ridx[i];
2480       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2481       ib = idx + ii[0];
2482       n  = ii[1] - ii[0]; ii++;
2483       for (j=0; j<n; j++) {
2484         rval       = ib[j]*3;
2485         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2486         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2487         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2488         v         += 9;
2489       }
2490       if (!usecprow) xb += 3;
2491     }
2492     break;
2493   case 4:
2494     for (i=0; i<mbs; i++) {
2495       if (usecprow) xb = x + 4*ridx[i];
2496       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2497       ib = idx + ii[0];
2498       n  = ii[1] - ii[0]; ii++;
2499       for (j=0; j<n; j++) {
2500         rval       = ib[j]*4;
2501         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2502         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2503         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2504         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2505         v         += 16;
2506       }
2507       if (!usecprow) xb += 4;
2508     }
2509     break;
2510   case 5:
2511     for (i=0; i<mbs; i++) {
2512       if (usecprow) xb = x + 5*ridx[i];
2513       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2514       x4 = xb[3]; x5 = xb[4];
2515       ib = idx + ii[0];
2516       n  = ii[1] - ii[0]; ii++;
2517       for (j=0; j<n; j++) {
2518         rval       = ib[j]*5;
2519         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2520         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2521         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2522         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2523         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2524         v         += 25;
2525       }
2526       if (!usecprow) xb += 5;
2527     }
2528     break;
2529   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2530     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2531 #if 0
2532     {
2533       PetscInt          ncols,k,bs2=a->bs2;
2534       PetscScalar       *work,*workt,zb;
2535       const PetscScalar *xtmp;
2536       if (!a->mult_work) {
2537         k    = PetscMax(A->rmap->n,A->cmap->n);
2538         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2539       }
2540       work = a->mult_work;
2541       xtmp = x;
2542       for (i=0; i<mbs; i++) {
2543         n     = ii[1] - ii[0]; ii++;
2544         ncols = n*bs;
2545         ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2546         if (usecprow) xtmp = x + bs*ridx[i];
2547         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2548         v += n*bs2;
2549         if (!usecprow) xtmp += bs;
2550         workt = work;
2551         for (j=0; j<n; j++) {
2552           zb = z + bs*(*idx++);
2553           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2554           workt += bs;
2555         }
2556       }
2557     }
2558 #endif
2559   }
2560   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2561   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2562   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2563   PetscFunctionReturn(0);
2564 }
2565 
2566 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2567 {
2568   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2569   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2570   const PetscScalar *x,*xb = NULL;
2571   const MatScalar   *v;
2572   PetscErrorCode    ierr;
2573   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2574   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2575   Mat_CompressedRow cprow   = a->compressedrow;
2576   PetscBool         usecprow=cprow.use;
2577 
2578   PetscFunctionBegin;
2579   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2580   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2581   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2582 
2583   idx = a->j;
2584   v   = a->a;
2585   if (usecprow) {
2586     mbs  = cprow.nrows;
2587     ii   = cprow.i;
2588     ridx = cprow.rindex;
2589   } else {
2590     mbs=a->mbs;
2591     ii = a->i;
2592     xb = x;
2593   }
2594 
2595   switch (bs) {
2596   case 1:
2597     for (i=0; i<mbs; i++) {
2598       if (usecprow) xb = x + ridx[i];
2599       x1 = xb[0];
2600       ib = idx + ii[0];
2601       n  = ii[1] - ii[0]; ii++;
2602       for (j=0; j<n; j++) {
2603         rval     = ib[j];
2604         z[rval] += *v * x1;
2605         v++;
2606       }
2607       if (!usecprow) xb++;
2608     }
2609     break;
2610   case 2:
2611     for (i=0; i<mbs; i++) {
2612       if (usecprow) xb = x + 2*ridx[i];
2613       x1 = xb[0]; x2 = xb[1];
2614       ib = idx + ii[0];
2615       n  = ii[1] - ii[0]; ii++;
2616       for (j=0; j<n; j++) {
2617         rval       = ib[j]*2;
2618         z[rval++] += v[0]*x1 + v[1]*x2;
2619         z[rval++] += v[2]*x1 + v[3]*x2;
2620         v         += 4;
2621       }
2622       if (!usecprow) xb += 2;
2623     }
2624     break;
2625   case 3:
2626     for (i=0; i<mbs; i++) {
2627       if (usecprow) xb = x + 3*ridx[i];
2628       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2629       ib = idx + ii[0];
2630       n  = ii[1] - ii[0]; ii++;
2631       for (j=0; j<n; j++) {
2632         rval       = ib[j]*3;
2633         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2634         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2635         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2636         v         += 9;
2637       }
2638       if (!usecprow) xb += 3;
2639     }
2640     break;
2641   case 4:
2642     for (i=0; i<mbs; i++) {
2643       if (usecprow) xb = x + 4*ridx[i];
2644       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2645       ib = idx + ii[0];
2646       n  = ii[1] - ii[0]; ii++;
2647       for (j=0; j<n; j++) {
2648         rval       = ib[j]*4;
2649         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2650         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2651         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2652         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2653         v         += 16;
2654       }
2655       if (!usecprow) xb += 4;
2656     }
2657     break;
2658   case 5:
2659     for (i=0; i<mbs; i++) {
2660       if (usecprow) xb = x + 5*ridx[i];
2661       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2662       x4 = xb[3]; x5 = xb[4];
2663       ib = idx + ii[0];
2664       n  = ii[1] - ii[0]; ii++;
2665       for (j=0; j<n; j++) {
2666         rval       = ib[j]*5;
2667         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2668         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2669         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2670         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2671         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2672         v         += 25;
2673       }
2674       if (!usecprow) xb += 5;
2675     }
2676     break;
2677   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2678     PetscInt          ncols,k;
2679     PetscScalar       *work,*workt;
2680     const PetscScalar *xtmp;
2681     if (!a->mult_work) {
2682       k    = PetscMax(A->rmap->n,A->cmap->n);
2683       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2684     }
2685     work = a->mult_work;
2686     xtmp = x;
2687     for (i=0; i<mbs; i++) {
2688       n     = ii[1] - ii[0]; ii++;
2689       ncols = n*bs;
2690       ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2691       if (usecprow) xtmp = x + bs*ridx[i];
2692       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2693       v += n*bs2;
2694       if (!usecprow) xtmp += bs;
2695       workt = work;
2696       for (j=0; j<n; j++) {
2697         zb = z + bs*(*idx++);
2698         for (k=0; k<bs; k++) zb[k] += workt[k];
2699         workt += bs;
2700       }
2701     }
2702     }
2703   }
2704   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2705   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2706   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2707   PetscFunctionReturn(0);
2708 }
2709 
2710 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2711 {
2712   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2713   PetscInt       totalnz = a->bs2*a->nz;
2714   PetscScalar    oalpha  = alpha;
2715   PetscErrorCode ierr;
2716   PetscBLASInt   one = 1,tnz;
2717 
2718   PetscFunctionBegin;
2719   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2720   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2721   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2722   PetscFunctionReturn(0);
2723 }
2724 
2725 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2726 {
2727   PetscErrorCode ierr;
2728   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2729   MatScalar      *v  = a->a;
2730   PetscReal      sum = 0.0;
2731   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2732 
2733   PetscFunctionBegin;
2734   if (type == NORM_FROBENIUS) {
2735 #if defined(PETSC_USE_REAL___FP16)
2736     PetscBLASInt one = 1,cnt = bs2*nz;
2737     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2738 #else
2739     for (i=0; i<bs2*nz; i++) {
2740       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2741     }
2742 #endif
2743     *norm = PetscSqrtReal(sum);
2744     ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr);
2745   } else if (type == NORM_1) { /* maximum column sum */
2746     PetscReal *tmp;
2747     PetscInt  *bcol = a->j;
2748     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2749     for (i=0; i<nz; i++) {
2750       for (j=0; j<bs; j++) {
2751         k1 = bs*(*bcol) + j; /* column index */
2752         for (k=0; k<bs; k++) {
2753           tmp[k1] += PetscAbsScalar(*v); v++;
2754         }
2755       }
2756       bcol++;
2757     }
2758     *norm = 0.0;
2759     for (j=0; j<A->cmap->n; j++) {
2760       if (tmp[j] > *norm) *norm = tmp[j];
2761     }
2762     ierr = PetscFree(tmp);CHKERRQ(ierr);
2763     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2764   } else if (type == NORM_INFINITY) { /* maximum row sum */
2765     *norm = 0.0;
2766     for (k=0; k<bs; k++) {
2767       for (j=0; j<a->mbs; j++) {
2768         v   = a->a + bs2*a->i[j] + k;
2769         sum = 0.0;
2770         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2771           for (k1=0; k1<bs; k1++) {
2772             sum += PetscAbsScalar(*v);
2773             v   += bs;
2774           }
2775         }
2776         if (sum > *norm) *norm = sum;
2777       }
2778     }
2779     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2780   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2781   PetscFunctionReturn(0);
2782 }
2783 
2784 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2785 {
2786   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2787   PetscErrorCode ierr;
2788 
2789   PetscFunctionBegin;
2790   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2791   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2792     *flg = PETSC_FALSE;
2793     PetscFunctionReturn(0);
2794   }
2795 
2796   /* if the a->i are the same */
2797   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
2798   if (!*flg) PetscFunctionReturn(0);
2799 
2800   /* if a->j are the same */
2801   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
2802   if (!*flg) PetscFunctionReturn(0);
2803 
2804   /* if a->a are the same */
2805   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr);
2806   PetscFunctionReturn(0);
2807 
2808 }
2809 
2810 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2811 {
2812   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2813   PetscErrorCode ierr;
2814   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2815   PetscScalar    *x,zero = 0.0;
2816   MatScalar      *aa,*aa_j;
2817 
2818   PetscFunctionBegin;
2819   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2820   bs   = A->rmap->bs;
2821   aa   = a->a;
2822   ai   = a->i;
2823   aj   = a->j;
2824   ambs = a->mbs;
2825   bs2  = a->bs2;
2826 
2827   ierr = VecSet(v,zero);CHKERRQ(ierr);
2828   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2829   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2830   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2831   for (i=0; i<ambs; i++) {
2832     for (j=ai[i]; j<ai[i+1]; j++) {
2833       if (aj[j] == i) {
2834         row  = i*bs;
2835         aa_j = aa+j*bs2;
2836         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2837         break;
2838       }
2839     }
2840   }
2841   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2842   PetscFunctionReturn(0);
2843 }
2844 
2845 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2846 {
2847   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2848   const PetscScalar *l,*r,*li,*ri;
2849   PetscScalar       x;
2850   MatScalar         *aa, *v;
2851   PetscErrorCode    ierr;
2852   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2853   const PetscInt    *ai,*aj;
2854 
2855   PetscFunctionBegin;
2856   ai  = a->i;
2857   aj  = a->j;
2858   aa  = a->a;
2859   m   = A->rmap->n;
2860   n   = A->cmap->n;
2861   bs  = A->rmap->bs;
2862   mbs = a->mbs;
2863   bs2 = a->bs2;
2864   if (ll) {
2865     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2866     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2867     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2868     for (i=0; i<mbs; i++) { /* for each block row */
2869       M  = ai[i+1] - ai[i];
2870       li = l + i*bs;
2871       v  = aa + bs2*ai[i];
2872       for (j=0; j<M; j++) { /* for each block */
2873         for (k=0; k<bs2; k++) {
2874           (*v++) *= li[k%bs];
2875         }
2876       }
2877     }
2878     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2879     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2880   }
2881 
2882   if (rr) {
2883     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2884     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2885     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2886     for (i=0; i<mbs; i++) { /* for each block row */
2887       iai = ai[i];
2888       M   = ai[i+1] - iai;
2889       v   = aa + bs2*iai;
2890       for (j=0; j<M; j++) { /* for each block */
2891         ri = r + bs*aj[iai+j];
2892         for (k=0; k<bs; k++) {
2893           x = ri[k];
2894           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2895           v += bs;
2896         }
2897       }
2898     }
2899     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2900     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2901   }
2902   PetscFunctionReturn(0);
2903 }
2904 
2905 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2906 {
2907   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2908 
2909   PetscFunctionBegin;
2910   info->block_size   = a->bs2;
2911   info->nz_allocated = a->bs2*a->maxnz;
2912   info->nz_used      = a->bs2*a->nz;
2913   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2914   info->assemblies   = A->num_ass;
2915   info->mallocs      = A->info.mallocs;
2916   info->memory       = ((PetscObject)A)->mem;
2917   if (A->factortype) {
2918     info->fill_ratio_given  = A->info.fill_ratio_given;
2919     info->fill_ratio_needed = A->info.fill_ratio_needed;
2920     info->factor_mallocs    = A->info.factor_mallocs;
2921   } else {
2922     info->fill_ratio_given  = 0;
2923     info->fill_ratio_needed = 0;
2924     info->factor_mallocs    = 0;
2925   }
2926   PetscFunctionReturn(0);
2927 }
2928 
2929 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2930 {
2931   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2932   PetscErrorCode ierr;
2933 
2934   PetscFunctionBegin;
2935   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
2936   PetscFunctionReturn(0);
2937 }
2938 
2939 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2940 {
2941   PetscErrorCode ierr;
2942 
2943   PetscFunctionBegin;
2944   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
2945   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2946   PetscFunctionReturn(0);
2947 }
2948 
2949 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2950 {
2951   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2952   PetscScalar       *z = NULL,sum1;
2953   const PetscScalar *xb;
2954   PetscScalar       x1;
2955   const MatScalar   *v,*vv;
2956   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2957   PetscBool         usecprow=a->compressedrow.use;
2958 
2959   PetscFunctionBegin;
2960   idx = a->j;
2961   v   = a->a;
2962   if (usecprow) {
2963     mbs  = a->compressedrow.nrows;
2964     ii   = a->compressedrow.i;
2965     ridx = a->compressedrow.rindex;
2966   } else {
2967     mbs = a->mbs;
2968     ii  = a->i;
2969     z   = c;
2970   }
2971 
2972   for (i=0; i<mbs; i++) {
2973     n           = ii[1] - ii[0]; ii++;
2974     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2975     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2976     if (usecprow) z = c + ridx[i];
2977     jj = idx;
2978     vv = v;
2979     for (k=0; k<cn; k++) {
2980       idx = jj;
2981       v = vv;
2982       sum1    = 0.0;
2983       for (j=0; j<n; j++) {
2984         xb    = b + (*idx++); x1 = xb[0+k*bm];
2985         sum1 += v[0]*x1;
2986         v    += 1;
2987       }
2988       z[0+k*cm] = sum1;
2989     }
2990     if (!usecprow) z += 1;
2991   }
2992   PetscFunctionReturn(0);
2993 }
2994 
2995 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2996 {
2997   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2998   PetscScalar       *z = NULL,sum1,sum2;
2999   const PetscScalar *xb;
3000   PetscScalar       x1,x2;
3001   const MatScalar   *v,*vv;
3002   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3003   PetscBool         usecprow=a->compressedrow.use;
3004 
3005   PetscFunctionBegin;
3006   idx = a->j;
3007   v   = a->a;
3008   if (usecprow) {
3009     mbs  = a->compressedrow.nrows;
3010     ii   = a->compressedrow.i;
3011     ridx = a->compressedrow.rindex;
3012   } else {
3013     mbs = a->mbs;
3014     ii  = a->i;
3015     z   = c;
3016   }
3017 
3018   for (i=0; i<mbs; i++) {
3019     n           = ii[1] - ii[0]; ii++;
3020     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3021     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3022     if (usecprow) z = c + 2*ridx[i];
3023     jj = idx;
3024     vv = v;
3025     for (k=0; k<cn; k++) {
3026       idx = jj;
3027       v = vv;
3028       sum1    = 0.0; sum2 = 0.0;
3029       for (j=0; j<n; j++) {
3030         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3031         sum1 += v[0]*x1 + v[2]*x2;
3032         sum2 += v[1]*x1 + v[3]*x2;
3033         v    += 4;
3034       }
3035       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3036     }
3037     if (!usecprow) z += 2;
3038   }
3039   PetscFunctionReturn(0);
3040 }
3041 
3042 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3043 {
3044   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3045   PetscScalar       *z = NULL,sum1,sum2,sum3;
3046   const PetscScalar *xb;
3047   PetscScalar       x1,x2,x3;
3048   const MatScalar   *v,*vv;
3049   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3050   PetscBool         usecprow=a->compressedrow.use;
3051 
3052   PetscFunctionBegin;
3053   idx = a->j;
3054   v   = a->a;
3055   if (usecprow) {
3056     mbs  = a->compressedrow.nrows;
3057     ii   = a->compressedrow.i;
3058     ridx = a->compressedrow.rindex;
3059   } else {
3060     mbs = a->mbs;
3061     ii  = a->i;
3062     z   = c;
3063   }
3064 
3065   for (i=0; i<mbs; i++) {
3066     n           = ii[1] - ii[0]; ii++;
3067     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3068     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3069     if (usecprow) z = c + 3*ridx[i];
3070     jj = idx;
3071     vv = v;
3072     for (k=0; k<cn; k++) {
3073       idx = jj;
3074       v = vv;
3075       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3076       for (j=0; j<n; j++) {
3077         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3078         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3079         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3080         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3081         v    += 9;
3082       }
3083       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3084     }
3085     if (!usecprow) z += 3;
3086   }
3087   PetscFunctionReturn(0);
3088 }
3089 
3090 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3091 {
3092   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3093   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3094   const PetscScalar *xb;
3095   PetscScalar       x1,x2,x3,x4;
3096   const MatScalar   *v,*vv;
3097   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3098   PetscBool         usecprow=a->compressedrow.use;
3099 
3100   PetscFunctionBegin;
3101   idx = a->j;
3102   v   = a->a;
3103   if (usecprow) {
3104     mbs  = a->compressedrow.nrows;
3105     ii   = a->compressedrow.i;
3106     ridx = a->compressedrow.rindex;
3107   } else {
3108     mbs = a->mbs;
3109     ii  = a->i;
3110     z   = c;
3111   }
3112 
3113   for (i=0; i<mbs; i++) {
3114     n           = ii[1] - ii[0]; ii++;
3115     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3116     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3117     if (usecprow) z = c + 4*ridx[i];
3118     jj = idx;
3119     vv = v;
3120     for (k=0; k<cn; k++) {
3121       idx = jj;
3122       v = vv;
3123       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3124       for (j=0; j<n; j++) {
3125         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3126         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3127         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3128         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3129         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3130         v    += 16;
3131       }
3132       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3133     }
3134     if (!usecprow) z += 4;
3135   }
3136   PetscFunctionReturn(0);
3137 }
3138 
3139 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3140 {
3141   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3142   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3143   const PetscScalar *xb;
3144   PetscScalar       x1,x2,x3,x4,x5;
3145   const MatScalar   *v,*vv;
3146   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3147   PetscBool         usecprow=a->compressedrow.use;
3148 
3149   PetscFunctionBegin;
3150   idx = a->j;
3151   v   = a->a;
3152   if (usecprow) {
3153     mbs  = a->compressedrow.nrows;
3154     ii   = a->compressedrow.i;
3155     ridx = a->compressedrow.rindex;
3156   } else {
3157     mbs = a->mbs;
3158     ii  = a->i;
3159     z   = c;
3160   }
3161 
3162   for (i=0; i<mbs; i++) {
3163     n           = ii[1] - ii[0]; ii++;
3164     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3165     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3166     if (usecprow) z = c + 5*ridx[i];
3167     jj = idx;
3168     vv = v;
3169     for (k=0; k<cn; k++) {
3170       idx = jj;
3171       v = vv;
3172       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3173       for (j=0; j<n; j++) {
3174         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3175         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3176         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3177         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3178         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3179         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3180         v    += 25;
3181       }
3182       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3183     }
3184     if (!usecprow) z += 5;
3185   }
3186   PetscFunctionReturn(0);
3187 }
3188 
3189 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3190 {
3191   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3192   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3193   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3194   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3195   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3196   PetscBLASInt      bbs,bcn,bbm,bcm;
3197   PetscScalar       *z = NULL;
3198   PetscScalar       *c,*b;
3199   const MatScalar   *v;
3200   const PetscInt    *idx,*ii,*ridx=NULL;
3201   PetscScalar       _DZero=0.0,_DOne=1.0;
3202   PetscBool         usecprow=a->compressedrow.use;
3203   PetscErrorCode    ierr;
3204 
3205   PetscFunctionBegin;
3206   if (!cm || !cn) PetscFunctionReturn(0);
3207   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3208   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3209   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3210   b = bd->v;
3211   if (a->nonzerorowcnt != A->rmap->n) {
3212     ierr = MatZeroEntries(C);CHKERRQ(ierr);
3213   }
3214   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
3215   switch (bs) {
3216   case 1:
3217     ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3218     break;
3219   case 2:
3220     ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3221     break;
3222   case 3:
3223     ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3224     break;
3225   case 4:
3226     ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3227     break;
3228   case 5:
3229     ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3230     break;
3231   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3232     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
3233     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
3234     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
3235     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
3236     idx = a->j;
3237     v   = a->a;
3238     if (usecprow) {
3239       mbs  = a->compressedrow.nrows;
3240       ii   = a->compressedrow.i;
3241       ridx = a->compressedrow.rindex;
3242     } else {
3243       mbs = a->mbs;
3244       ii  = a->i;
3245       z   = c;
3246     }
3247     for (i=0; i<mbs; i++) {
3248       n = ii[1] - ii[0]; ii++;
3249       if (usecprow) z = c + bs*ridx[i];
3250       if (n) {
3251         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3252         v += bs2;
3253       }
3254       for (j=1; j<n; j++) {
3255         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3256         v += bs2;
3257       }
3258       if (!usecprow) z += bs;
3259     }
3260   }
3261   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
3262   ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr);
3263   PetscFunctionReturn(0);
3264 }
3265