xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision ccb4e88a40f0b86eaeca07ff64c64e4de2fae686)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <../src/mat/impls/dense/seq/dense.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8 #include <immintrin.h>
9 #endif
10 
11 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12 {
13   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
16   const PetscInt *idx;
17   PetscInt       start,end,*ai,*aj,bs,*nidx2;
18   PetscBT        table;
19 
20   PetscFunctionBegin;
21   m  = a->mbs;
22   ai = a->i;
23   aj = a->j;
24   bs = A->rmap->bs;
25 
26   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27 
28   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
29   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
30   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
31 
32   for (i=0; i<is_max; i++) {
33     /* Initialise the two local arrays */
34     isz  = 0;
35     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
36 
37     /* Extract the indices, assume there can be duplicate entries */
38     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
39     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
40 
41     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
42     for (j=0; j<n; ++j) {
43       ival = idx[j]/bs; /* convert the indices into block indices */
44       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
45       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap*/
52       n = isz;
53       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
54         row   = nidx[k];
55         start = ai[row];
56         end   = ai[row+1];
57         for (l = start; l<end; l++) {
58           val = aj[l];
59           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
60         }
61       }
62     }
63     /* expand the Index Set */
64     for (j=0; j<isz; j++) {
65       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
66     }
67     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
68   }
69   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
70   ierr = PetscFree(nidx);CHKERRQ(ierr);
71   ierr = PetscFree(nidx2);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76 {
77   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
78   PetscErrorCode ierr;
79   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
81   const PetscInt *irow,*icol;
82   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83   PetscInt       *aj = a->j,*ai = a->i;
84   MatScalar      *mat_a;
85   Mat            C;
86   PetscBool      flag;
87 
88   PetscFunctionBegin;
89   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
90   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
91   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
93 
94   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
95   ssmap = smap;
96   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
97   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98   /* determine lens of each row */
99   for (i=0; i<nrows; i++) {
100     kstart  = ai[irow[i]];
101     kend    = kstart + a->ilen[irow[i]];
102     lens[i] = 0;
103     for (k=kstart; k<kend; k++) {
104       if (ssmap[aj[k]]) lens[i]++;
105     }
106   }
107   /* Create and fill new matrix */
108   if (scall == MAT_REUSE_MATRIX) {
109     c = (Mat_SeqBAIJ*)((*B)->data);
110 
111     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
113     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
115     C    = *B;
116   } else {
117     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
118     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
119     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
120     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
121   }
122   c = (Mat_SeqBAIJ*)(C->data);
123   for (i=0; i<nrows; i++) {
124     row      = irow[i];
125     kstart   = ai[row];
126     kend     = kstart + a->ilen[row];
127     mat_i    = c->i[i];
128     mat_j    = c->j + mat_i;
129     mat_a    = c->a + mat_i*bs2;
130     mat_ilen = c->ilen + i;
131     for (k=kstart; k<kend; k++) {
132       if ((tcol=ssmap[a->j[k]])) {
133         *mat_j++ = tcol - 1;
134         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
135         mat_a   += bs2;
136         (*mat_ilen)++;
137       }
138     }
139   }
140   /* sort */
141   {
142     MatScalar *work;
143     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
144     for (i=0; i<nrows; i++) {
145       PetscInt ilen;
146       mat_i = c->i[i];
147       mat_j = c->j + mat_i;
148       mat_a = c->a + mat_i*bs2;
149       ilen  = c->ilen[i];
150       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
151     }
152     ierr = PetscFree(work);CHKERRQ(ierr);
153   }
154 
155   /* Free work space */
156   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
157   ierr = PetscFree(smap);CHKERRQ(ierr);
158   ierr = PetscFree(lens);CHKERRQ(ierr);
159   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
160   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161 
162   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
163   *B   = C;
164   PetscFunctionReturn(0);
165 }
166 
167 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
168 {
169   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
170   IS             is1,is2;
171   PetscErrorCode ierr;
172   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
173   const PetscInt *irow,*icol;
174 
175   PetscFunctionBegin;
176   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
177   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
178   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
179   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
180 
181   /* Verify if the indices corespond to each element in a block
182    and form the IS with compressed IS */
183   maxmnbs = PetscMax(a->mbs,a->nbs);
184   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
185   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
186   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
187   for (i=0; i<a->mbs; i++) {
188     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
189   }
190   count = 0;
191   for (i=0; i<nrows; i++) {
192     j = irow[i] / bs;
193     if ((vary[j]--)==bs) iary[count++] = j;
194   }
195   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
196 
197   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
198   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
199   for (i=0; i<a->nbs; i++) {
200     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
201   }
202   count = 0;
203   for (i=0; i<ncols; i++) {
204     j = icol[i] / bs;
205     if ((vary[j]--)==bs) iary[count++] = j;
206   }
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
208   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
209   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
210   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
211 
212   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
213   ierr = ISDestroy(&is1);CHKERRQ(ierr);
214   ierr = ISDestroy(&is2);CHKERRQ(ierr);
215   PetscFunctionReturn(0);
216 }
217 
218 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
219 {
220   PetscErrorCode ierr;
221   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
222   Mat_SubSppt    *submatj = c->submatis1;
223 
224   PetscFunctionBegin;
225   ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
226   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
230 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
231 {
232   PetscErrorCode ierr;
233   PetscInt       i;
234   Mat            C;
235   Mat_SeqBAIJ    *c;
236   Mat_SubSppt    *submatj;
237 
238   PetscFunctionBegin;
239   for (i=0; i<n; i++) {
240     C       = (*mat)[i];
241     c       = (Mat_SeqBAIJ*)C->data;
242     submatj = c->submatis1;
243     if (submatj) {
244       if (--((PetscObject)C)->refct <= 0) {
245         ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
246         ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
247         ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
248         ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
249         ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
250         ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
251       }
252     } else {
253       ierr = MatDestroy(&C);CHKERRQ(ierr);
254     }
255   }
256 
257   /* Destroy Dummy submatrices created for reuse */
258   ierr = MatDestroySubMatrices_Dummy(n,mat);CHKERRQ(ierr);
259 
260   ierr = PetscFree(*mat);CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
265 {
266   PetscErrorCode ierr;
267   PetscInt       i;
268 
269   PetscFunctionBegin;
270   if (scall == MAT_INITIAL_MATRIX) {
271     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
272   }
273 
274   for (i=0; i<n; i++) {
275     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
276   }
277   PetscFunctionReturn(0);
278 }
279 
280 /* -------------------------------------------------------*/
281 /* Should check that shapes of vectors and matrices match */
282 /* -------------------------------------------------------*/
283 
284 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
285 {
286   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
287   PetscScalar       *z,sum;
288   const PetscScalar *x;
289   const MatScalar   *v;
290   PetscErrorCode    ierr;
291   PetscInt          mbs,i,n;
292   const PetscInt    *idx,*ii,*ridx=NULL;
293   PetscBool         usecprow=a->compressedrow.use;
294 
295   PetscFunctionBegin;
296   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
297   ierr = VecGetArrayWrite(zz,&z);CHKERRQ(ierr);
298 
299   if (usecprow) {
300     mbs  = a->compressedrow.nrows;
301     ii   = a->compressedrow.i;
302     ridx = a->compressedrow.rindex;
303     ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr);
304   } else {
305     mbs = a->mbs;
306     ii  = a->i;
307   }
308 
309   for (i=0; i<mbs; i++) {
310     n   = ii[1] - ii[0];
311     v   = a->a + ii[0];
312     idx = a->j + ii[0];
313     ii++;
314     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
315     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
316     sum = 0.0;
317     PetscSparseDensePlusDot(sum,x,v,idx,n);
318     if (usecprow) {
319       z[ridx[i]] = sum;
320     } else {
321       z[i]       = sum;
322     }
323   }
324   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
325   ierr = VecRestoreArrayWrite(zz,&z);CHKERRQ(ierr);
326   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
327   PetscFunctionReturn(0);
328 }
329 
330 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
331 {
332   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333   PetscScalar       *z = NULL,sum1,sum2,*zarray;
334   const PetscScalar *x,*xb;
335   PetscScalar       x1,x2;
336   const MatScalar   *v;
337   PetscErrorCode    ierr;
338   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
339   PetscBool         usecprow=a->compressedrow.use;
340 
341   PetscFunctionBegin;
342   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
343   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
344 
345   idx = a->j;
346   v   = a->a;
347   if (usecprow) {
348     mbs  = a->compressedrow.nrows;
349     ii   = a->compressedrow.i;
350     ridx = a->compressedrow.rindex;
351     ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr);
352   } else {
353     mbs = a->mbs;
354     ii  = a->i;
355     z   = zarray;
356   }
357 
358   for (i=0; i<mbs; i++) {
359     n           = ii[1] - ii[0]; ii++;
360     sum1        = 0.0; sum2 = 0.0;
361     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
362     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
363     for (j=0; j<n; j++) {
364       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
365       sum1 += v[0]*x1 + v[2]*x2;
366       sum2 += v[1]*x1 + v[3]*x2;
367       v    += 4;
368     }
369     if (usecprow) z = zarray + 2*ridx[i];
370     z[0] = sum1; z[1] = sum2;
371     if (!usecprow) z += 2;
372   }
373   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
374   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
375   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
376   PetscFunctionReturn(0);
377 }
378 
379 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
380 {
381   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
382   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
383   const PetscScalar *x,*xb;
384   const MatScalar   *v;
385   PetscErrorCode    ierr;
386   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
387   PetscBool         usecprow=a->compressedrow.use;
388 
389 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
390 #pragma disjoint(*v,*z,*xb)
391 #endif
392 
393   PetscFunctionBegin;
394   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
395   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
396 
397   idx = a->j;
398   v   = a->a;
399   if (usecprow) {
400     mbs  = a->compressedrow.nrows;
401     ii   = a->compressedrow.i;
402     ridx = a->compressedrow.rindex;
403     ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr);
404   } else {
405     mbs = a->mbs;
406     ii  = a->i;
407     z   = zarray;
408   }
409 
410   for (i=0; i<mbs; i++) {
411     n           = ii[1] - ii[0]; ii++;
412     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
413     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
414     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
415     for (j=0; j<n; j++) {
416       xb = x + 3*(*idx++);
417       x1 = xb[0];
418       x2 = xb[1];
419       x3 = xb[2];
420 
421       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
422       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
423       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
424       v    += 9;
425     }
426     if (usecprow) z = zarray + 3*ridx[i];
427     z[0] = sum1; z[1] = sum2; z[2] = sum3;
428     if (!usecprow) z += 3;
429   }
430   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
431   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
432   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
433   PetscFunctionReturn(0);
434 }
435 
436 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
437 {
438   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
439   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
440   const PetscScalar *x,*xb;
441   const MatScalar   *v;
442   PetscErrorCode    ierr;
443   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
444   PetscBool         usecprow=a->compressedrow.use;
445 
446   PetscFunctionBegin;
447   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
448   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
449 
450   idx = a->j;
451   v   = a->a;
452   if (usecprow) {
453     mbs  = a->compressedrow.nrows;
454     ii   = a->compressedrow.i;
455     ridx = a->compressedrow.rindex;
456     ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr);
457   } else {
458     mbs = a->mbs;
459     ii  = a->i;
460     z   = zarray;
461   }
462 
463   for (i=0; i<mbs; i++) {
464     n = ii[1] - ii[0];
465     ii++;
466     sum1 = 0.0;
467     sum2 = 0.0;
468     sum3 = 0.0;
469     sum4 = 0.0;
470 
471     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
472     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
473     for (j=0; j<n; j++) {
474       xb    = x + 4*(*idx++);
475       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
476       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
477       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
478       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
479       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
480       v    += 16;
481     }
482     if (usecprow) z = zarray + 4*ridx[i];
483     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
484     if (!usecprow) z += 4;
485   }
486   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
487   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
488   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
489   PetscFunctionReturn(0);
490 }
491 
492 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
493 {
494   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
495   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
496   const PetscScalar *xb,*x;
497   const MatScalar   *v;
498   PetscErrorCode    ierr;
499   const PetscInt    *idx,*ii,*ridx=NULL;
500   PetscInt          mbs,i,j,n;
501   PetscBool         usecprow=a->compressedrow.use;
502 
503   PetscFunctionBegin;
504   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
505   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
506 
507   idx = a->j;
508   v   = a->a;
509   if (usecprow) {
510     mbs  = a->compressedrow.nrows;
511     ii   = a->compressedrow.i;
512     ridx = a->compressedrow.rindex;
513     ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr);
514   } else {
515     mbs = a->mbs;
516     ii  = a->i;
517     z   = zarray;
518   }
519 
520   for (i=0; i<mbs; i++) {
521     n           = ii[1] - ii[0]; ii++;
522     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
523     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
524     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
525     for (j=0; j<n; j++) {
526       xb    = x + 5*(*idx++);
527       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
528       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
529       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
530       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
531       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
532       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
533       v    += 25;
534     }
535     if (usecprow) z = zarray + 5*ridx[i];
536     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
537     if (!usecprow) z += 5;
538   }
539   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
540   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
541   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
542   PetscFunctionReturn(0);
543 }
544 
545 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
546 {
547   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
548   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
549   const PetscScalar *x,*xb;
550   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
551   const MatScalar   *v;
552   PetscErrorCode    ierr;
553   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
554   PetscBool         usecprow=a->compressedrow.use;
555 
556   PetscFunctionBegin;
557   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
558   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
559 
560   idx = a->j;
561   v   = a->a;
562   if (usecprow) {
563     mbs  = a->compressedrow.nrows;
564     ii   = a->compressedrow.i;
565     ridx = a->compressedrow.rindex;
566     ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr);
567   } else {
568     mbs = a->mbs;
569     ii  = a->i;
570     z   = zarray;
571   }
572 
573   for (i=0; i<mbs; i++) {
574     n  = ii[1] - ii[0];
575     ii++;
576     sum1 = 0.0;
577     sum2 = 0.0;
578     sum3 = 0.0;
579     sum4 = 0.0;
580     sum5 = 0.0;
581     sum6 = 0.0;
582 
583     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
584     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
585     for (j=0; j<n; j++) {
586       xb    = x + 6*(*idx++);
587       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
588       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
589       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
590       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
591       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
592       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
593       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
594       v    += 36;
595     }
596     if (usecprow) z = zarray + 6*ridx[i];
597     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
598     if (!usecprow) z += 6;
599   }
600 
601   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
602   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
603   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
604   PetscFunctionReturn(0);
605 }
606 
607 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
608 {
609   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
610   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
611   const PetscScalar *x,*xb;
612   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
613   const MatScalar   *v;
614   PetscErrorCode    ierr;
615   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
616   PetscBool         usecprow=a->compressedrow.use;
617 
618   PetscFunctionBegin;
619   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
620   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
621 
622   idx = a->j;
623   v   = a->a;
624   if (usecprow) {
625     mbs  = a->compressedrow.nrows;
626     ii   = a->compressedrow.i;
627     ridx = a->compressedrow.rindex;
628     ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr);
629   } else {
630     mbs = a->mbs;
631     ii  = a->i;
632     z   = zarray;
633   }
634 
635   for (i=0; i<mbs; i++) {
636     n  = ii[1] - ii[0];
637     ii++;
638     sum1 = 0.0;
639     sum2 = 0.0;
640     sum3 = 0.0;
641     sum4 = 0.0;
642     sum5 = 0.0;
643     sum6 = 0.0;
644     sum7 = 0.0;
645 
646     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
647     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
648     for (j=0; j<n; j++) {
649       xb    = x + 7*(*idx++);
650       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
651       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
652       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
653       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
654       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
655       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
656       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
657       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
658       v    += 49;
659     }
660     if (usecprow) z = zarray + 7*ridx[i];
661     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
662     if (!usecprow) z += 7;
663   }
664 
665   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
666   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
667   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
668   PetscFunctionReturn(0);
669 }
670 
671 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
672 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
673 {
674   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
675   PetscScalar       *z = NULL,*work,*workt,*zarray;
676   const PetscScalar *x,*xb;
677   const MatScalar   *v;
678   PetscErrorCode    ierr;
679   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
680   const PetscInt    *idx,*ii,*ridx=NULL;
681   PetscInt          k;
682   PetscBool         usecprow=a->compressedrow.use;
683 
684   __m256d a0,a1,a2,a3,a4,a5;
685   __m256d w0,w1,w2,w3;
686   __m256d z0,z1,z2;
687   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
688 
689   PetscFunctionBegin;
690   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
691   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
692 
693   idx = a->j;
694   v   = a->a;
695   if (usecprow) {
696     mbs  = a->compressedrow.nrows;
697     ii   = a->compressedrow.i;
698     ridx = a->compressedrow.rindex;
699     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
700   } else {
701     mbs = a->mbs;
702     ii  = a->i;
703     z   = zarray;
704   }
705 
706   if (!a->mult_work) {
707     k    = PetscMax(A->rmap->n,A->cmap->n);
708     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
709   }
710 
711   work = a->mult_work;
712   for (i=0; i<mbs; i++) {
713     n           = ii[1] - ii[0]; ii++;
714     workt       = work;
715     for (j=0; j<n; j++) {
716       xb = x + bs*(*idx++);
717       for (k=0; k<bs; k++) workt[k] = xb[k];
718       workt += bs;
719     }
720     if (usecprow) z = zarray + bs*ridx[i];
721 
722     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
723 
724     for (j=0; j<n; j++) {
725       /* first column of a */
726       w0 = _mm256_set1_pd(work[j*9  ]);
727       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
728       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
729       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
730 
731       /* second column of a */
732       w1 = _mm256_set1_pd(work[j*9+ 1]);
733       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
734       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
735       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
736 
737       /* third column of a */
738       w2 = _mm256_set1_pd(work[j*9 +2]);
739       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
740       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
741       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
742 
743       /* fourth column of a */
744       w3 = _mm256_set1_pd(work[j*9+ 3]);
745       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
746       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
747       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
748 
749       /* fifth column of a */
750       w0 = _mm256_set1_pd(work[j*9+ 4]);
751       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
752       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
753       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
754 
755       /* sixth column of a */
756       w1 = _mm256_set1_pd(work[j*9+ 5]);
757       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
758       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
759       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
760 
761       /* seventh column of a */
762       w2 = _mm256_set1_pd(work[j*9+ 6]);
763       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
764       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
765       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
766 
767       /* eigth column of a */
768       w3 = _mm256_set1_pd(work[j*9+ 7]);
769       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
770       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
771       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
772 
773       /* ninth column of a */
774       w0 = _mm256_set1_pd(work[j*9+ 8]);
775       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
776       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
777       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
778     }
779 
780     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
781 
782     v += n*bs2;
783     if (!usecprow) z += bs;
784   }
785   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
786   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
787   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
788   PetscFunctionReturn(0);
789 }
790 #endif
791 
792 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
793 {
794   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
795   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
796   const PetscScalar *x,*xb;
797   PetscScalar       *zarray,xv;
798   const MatScalar   *v;
799   PetscErrorCode    ierr;
800   const PetscInt    *ii,*ij=a->j,*idx;
801   PetscInt          mbs,i,j,k,n,*ridx=NULL;
802   PetscBool         usecprow=a->compressedrow.use;
803 
804   PetscFunctionBegin;
805   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
806   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
807 
808   v = a->a;
809   if (usecprow) {
810     mbs  = a->compressedrow.nrows;
811     ii   = a->compressedrow.i;
812     ridx = a->compressedrow.rindex;
813     ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr);
814   } else {
815     mbs = a->mbs;
816     ii  = a->i;
817     z   = zarray;
818   }
819 
820   for (i=0; i<mbs; i++) {
821     n    = ii[i+1] - ii[i];
822     idx  = ij + ii[i];
823     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
824     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
825 
826     for (j=0; j<n; j++) {
827       xb = x + 11*(idx[j]);
828 
829       for (k=0; k<11; k++) {
830         xv     =  xb[k];
831         sum1  += v[0]*xv;
832         sum2  += v[1]*xv;
833         sum3  += v[2]*xv;
834         sum4  += v[3]*xv;
835         sum5  += v[4]*xv;
836         sum6  += v[5]*xv;
837         sum7  += v[6]*xv;
838         sum8  += v[7]*xv;
839         sum9  += v[8]*xv;
840         sum10 += v[9]*xv;
841         sum11 += v[10]*xv;
842         v     += 11;
843       }
844     }
845     if (usecprow) z = zarray + 11*ridx[i];
846     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
847     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
848 
849     if (!usecprow) z += 11;
850   }
851 
852   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
853   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
854   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
855   PetscFunctionReturn(0);
856 }
857 
858 /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
859 PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
860 {
861   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
862   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
863   const PetscScalar *x,*xb;
864   PetscScalar       *zarray,xv;
865   const MatScalar   *v;
866   PetscErrorCode    ierr;
867   const PetscInt    *ii,*ij=a->j,*idx;
868   PetscInt          mbs,i,j,k,n,*ridx=NULL;
869   PetscBool         usecprow=a->compressedrow.use;
870 
871   PetscFunctionBegin;
872   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
873   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
874 
875   v = a->a;
876   if (usecprow) {
877     mbs  = a->compressedrow.nrows;
878     ii   = a->compressedrow.i;
879     ridx = a->compressedrow.rindex;
880     ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr);
881   } else {
882     mbs = a->mbs;
883     ii  = a->i;
884     z   = zarray;
885   }
886 
887   for (i=0; i<mbs; i++) {
888     n    = ii[i+1] - ii[i];
889     idx  = ij + ii[i];
890     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
891     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
892 
893     for (j=0; j<n; j++) {
894       xb = x + 12*(idx[j]);
895 
896       for (k=0; k<12; k++) {
897         xv     =  xb[k];
898         sum1  += v[0]*xv;
899         sum2  += v[1]*xv;
900         sum3  += v[2]*xv;
901         sum4  += v[3]*xv;
902         sum5  += v[4]*xv;
903         sum6  += v[5]*xv;
904         sum7  += v[6]*xv;
905         sum8  += v[7]*xv;
906         sum9  += v[8]*xv;
907         sum10 += v[9]*xv;
908         sum11 += v[10]*xv;
909         sum12 += v[11]*xv;
910         v     += 12;
911       }
912     }
913     if (usecprow) z = zarray + 12*ridx[i];
914     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
915     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
916     if (!usecprow) z += 12;
917   }
918   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
919   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
920   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
921   PetscFunctionReturn(0);
922 }
923 
924 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
925 {
926   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
927   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
928   const PetscScalar *x,*xb;
929   PetscScalar       *zarray,*yarray,xv;
930   const MatScalar   *v;
931   PetscErrorCode    ierr;
932   const PetscInt    *ii,*ij=a->j,*idx;
933   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
934   PetscBool         usecprow=a->compressedrow.use;
935 
936   PetscFunctionBegin;
937   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
938   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
939 
940   v = a->a;
941   if (usecprow) {
942    if (zz != yy) {
943      ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr);
944     }
945     mbs  = a->compressedrow.nrows;
946     ii   = a->compressedrow.i;
947     ridx = a->compressedrow.rindex;
948   } else {
949     ii  = a->i;
950     y   = yarray;
951     z   = zarray;
952   }
953 
954   for (i=0; i<mbs; i++) {
955     n    = ii[i+1] - ii[i];
956     idx  = ij + ii[i];
957 
958     if (usecprow) {
959       y = yarray + 12*ridx[i];
960       z = zarray + 12*ridx[i];
961     }
962     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
963     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
964 
965     for (j=0; j<n; j++) {
966       xb = x + 12*(idx[j]);
967 
968       for (k=0; k<12; k++) {
969         xv     =  xb[k];
970         sum1  += v[0]*xv;
971         sum2  += v[1]*xv;
972         sum3  += v[2]*xv;
973         sum4  += v[3]*xv;
974         sum5  += v[4]*xv;
975         sum6  += v[5]*xv;
976         sum7  += v[6]*xv;
977         sum8  += v[7]*xv;
978         sum9  += v[8]*xv;
979         sum10 += v[9]*xv;
980         sum11 += v[10]*xv;
981         sum12 += v[11]*xv;
982         v     += 12;
983       }
984     }
985 
986     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
987     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
988     if (!usecprow) {
989       y += 12;
990       z += 12;
991     }
992   }
993   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
994   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
995   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
996   PetscFunctionReturn(0);
997 }
998 
999 /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1000 PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
1001 {
1002   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1003   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1004   const PetscScalar *x,*xb;
1005   PetscScalar       x1,x2,x3,x4,*zarray;
1006   const MatScalar   *v;
1007   PetscErrorCode    ierr;
1008   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1009   PetscInt          mbs,i,j,n;
1010   PetscBool         usecprow=a->compressedrow.use;
1011 
1012   PetscFunctionBegin;
1013   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1014   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1015 
1016   v = a->a;
1017   if (usecprow) {
1018     mbs  = a->compressedrow.nrows;
1019     ii   = a->compressedrow.i;
1020     ridx = a->compressedrow.rindex;
1021     ierr = PetscArrayzero(zarray,12*a->mbs);CHKERRQ(ierr);
1022   } else {
1023     mbs = a->mbs;
1024     ii  = a->i;
1025     z   = zarray;
1026   }
1027 
1028   for (i=0; i<mbs; i++) {
1029     n    = ii[i+1] - ii[i];
1030     idx  = ij + ii[i];
1031 
1032     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1033     for (j=0; j<n; j++) {
1034       xb = x + 12*(idx[j]);
1035       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1036 
1037       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1038       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1039       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1040       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1041       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1042       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1043       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1044       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1045       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1046       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1047       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1048       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1049       v += 48;
1050 
1051       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1052 
1053       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1054       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1055       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1056       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1057       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1058       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1059       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1060       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1061       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1062       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1063       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1064       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1065       v     += 48;
1066 
1067       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1068       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1069       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1070       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1071       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1072       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1073       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1074       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1075       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1076       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1077       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1078       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1079       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1080       v     += 48;
1081 
1082     }
1083     if (usecprow) z = zarray + 12*ridx[i];
1084     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1085     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1086     if (!usecprow) z += 12;
1087   }
1088   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1089   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1090   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1095 PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1096 {
1097   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1098   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1099   const PetscScalar *x,*xb;
1100   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1101   const MatScalar   *v;
1102   PetscErrorCode    ierr;
1103   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1104   PetscInt          mbs = a->mbs,i,j,n;
1105   PetscBool         usecprow=a->compressedrow.use;
1106 
1107   PetscFunctionBegin;
1108   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1109   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1110 
1111   v = a->a;
1112   if (usecprow) {
1113     if (zz != yy) {
1114       ierr = PetscArraycpy(zarray,yarray,12*mbs);CHKERRQ(ierr);
1115     }
1116     mbs  = a->compressedrow.nrows;
1117     ii   = a->compressedrow.i;
1118     ridx = a->compressedrow.rindex;
1119   } else {
1120     ii  = a->i;
1121     y   = yarray;
1122     z   = zarray;
1123   }
1124 
1125   for (i=0; i<mbs; i++) {
1126     n    = ii[i+1] - ii[i];
1127     idx  = ij + ii[i];
1128 
1129     if (usecprow) {
1130       y = yarray + 12*ridx[i];
1131       z = zarray + 12*ridx[i];
1132     }
1133     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1134     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1135 
1136     for (j=0; j<n; j++) {
1137       xb = x + 12*(idx[j]);
1138       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1139 
1140       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1141       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1142       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1143       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1144       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1145       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1146       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1147       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1148       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1149       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1150       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1151       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1152       v += 48;
1153 
1154       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1155 
1156       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1157       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1158       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1159       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1160       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1161       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1162       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1163       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1164       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1165       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1166       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1167       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1168       v     += 48;
1169 
1170       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1171       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1172       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1173       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1174       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1175       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1176       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1177       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1178       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1179       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1180       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1181       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1182       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1183       v     += 48;
1184 
1185     }
1186     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1187     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1188     if (!usecprow) {
1189       y += 12;
1190       z += 12;
1191     }
1192   }
1193   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1194   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1195   ierr = PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);CHKERRQ(ierr);
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1200 PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1201 {
1202   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1203   PetscScalar       *z = NULL,*zarray;
1204   const PetscScalar *x,*work;
1205   const MatScalar   *v = a->a;
1206   PetscErrorCode    ierr;
1207   PetscInt          mbs,i,j,n;
1208   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1209   PetscBool         usecprow=a->compressedrow.use;
1210   const PetscInt    bs = 12, bs2 = 144;
1211 
1212   __m256d a0,a1,a2,a3,a4,a5;
1213   __m256d w0,w1,w2,w3;
1214   __m256d z0,z1,z2;
1215 
1216   PetscFunctionBegin;
1217   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1218   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1219 
1220   if (usecprow) {
1221     mbs  = a->compressedrow.nrows;
1222     ii   = a->compressedrow.i;
1223     ridx = a->compressedrow.rindex;
1224     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1225   } else {
1226     mbs = a->mbs;
1227     ii  = a->i;
1228     z   = zarray;
1229   }
1230 
1231   for (i=0; i<mbs; i++) {
1232     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1233 
1234     n  = ii[1] - ii[0]; ii++;
1235     for (j=0; j<n; j++) {
1236       work = x + bs*(*idx++);
1237 
1238       /* first column of a */
1239       w0 = _mm256_set1_pd(work[0]);
1240       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1241       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1242       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1243 
1244       /* second column of a */
1245       w1 = _mm256_set1_pd(work[1]);
1246       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1247       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1248       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1249 
1250       /* third column of a */
1251       w2 = _mm256_set1_pd(work[2]);
1252       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1253       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1254       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1255 
1256       /* fourth column of a */
1257       w3 = _mm256_set1_pd(work[3]);
1258       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1259       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1260       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1261 
1262       /* fifth column of a */
1263       w0 = _mm256_set1_pd(work[4]);
1264       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1265       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1266       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1267 
1268       /* sixth column of a */
1269       w1 = _mm256_set1_pd(work[5]);
1270       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1271       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1272       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1273 
1274       /* seventh column of a */
1275       w2 = _mm256_set1_pd(work[6]);
1276       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1277       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1278       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1279 
1280       /* eigth column of a */
1281       w3 = _mm256_set1_pd(work[7]);
1282       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1283       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1284       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1285 
1286       /* ninth column of a */
1287       w0 = _mm256_set1_pd(work[8]);
1288       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1289       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1290       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1291 
1292       /* tenth column of a */
1293       w1 = _mm256_set1_pd(work[9]);
1294       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1295       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1296       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1297 
1298       /* eleventh column of a */
1299       w2 = _mm256_set1_pd(work[10]);
1300       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1301       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1302       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1303 
1304       /* twelveth column of a */
1305       w3 = _mm256_set1_pd(work[11]);
1306       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1307       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1308       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1309 
1310       v += bs2;
1311     }
1312     if (usecprow) z = zarray + bs*ridx[i];
1313     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1314     if (!usecprow) z += bs;
1315   }
1316   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1317   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1318   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1319   PetscFunctionReturn(0);
1320 }
1321 #endif
1322 
1323 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1324 /* Default MatMult for block size 15 */
1325 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1326 {
1327   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1328   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1329   const PetscScalar *x,*xb;
1330   PetscScalar       *zarray,xv;
1331   const MatScalar   *v;
1332   PetscErrorCode    ierr;
1333   const PetscInt    *ii,*ij=a->j,*idx;
1334   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1335   PetscBool         usecprow=a->compressedrow.use;
1336 
1337   PetscFunctionBegin;
1338   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1339   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1340 
1341   v = a->a;
1342   if (usecprow) {
1343     mbs  = a->compressedrow.nrows;
1344     ii   = a->compressedrow.i;
1345     ridx = a->compressedrow.rindex;
1346     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1347   } else {
1348     mbs = a->mbs;
1349     ii  = a->i;
1350     z   = zarray;
1351   }
1352 
1353   for (i=0; i<mbs; i++) {
1354     n    = ii[i+1] - ii[i];
1355     idx  = ij + ii[i];
1356     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1357     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1358 
1359     for (j=0; j<n; j++) {
1360       xb = x + 15*(idx[j]);
1361 
1362       for (k=0; k<15; k++) {
1363         xv     =  xb[k];
1364         sum1  += v[0]*xv;
1365         sum2  += v[1]*xv;
1366         sum3  += v[2]*xv;
1367         sum4  += v[3]*xv;
1368         sum5  += v[4]*xv;
1369         sum6  += v[5]*xv;
1370         sum7  += v[6]*xv;
1371         sum8  += v[7]*xv;
1372         sum9  += v[8]*xv;
1373         sum10 += v[9]*xv;
1374         sum11 += v[10]*xv;
1375         sum12 += v[11]*xv;
1376         sum13 += v[12]*xv;
1377         sum14 += v[13]*xv;
1378         sum15 += v[14]*xv;
1379         v     += 15;
1380       }
1381     }
1382     if (usecprow) z = zarray + 15*ridx[i];
1383     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1384     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1385 
1386     if (!usecprow) z += 15;
1387   }
1388 
1389   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1390   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1391   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1392   PetscFunctionReturn(0);
1393 }
1394 
1395 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1396 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1397 {
1398   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1399   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1400   const PetscScalar *x,*xb;
1401   PetscScalar       x1,x2,x3,x4,*zarray;
1402   const MatScalar   *v;
1403   PetscErrorCode    ierr;
1404   const PetscInt    *ii,*ij=a->j,*idx;
1405   PetscInt          mbs,i,j,n,*ridx=NULL;
1406   PetscBool         usecprow=a->compressedrow.use;
1407 
1408   PetscFunctionBegin;
1409   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1410   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1411 
1412   v = a->a;
1413   if (usecprow) {
1414     mbs  = a->compressedrow.nrows;
1415     ii   = a->compressedrow.i;
1416     ridx = a->compressedrow.rindex;
1417     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1418   } else {
1419     mbs = a->mbs;
1420     ii  = a->i;
1421     z   = zarray;
1422   }
1423 
1424   for (i=0; i<mbs; i++) {
1425     n    = ii[i+1] - ii[i];
1426     idx  = ij + ii[i];
1427     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1428     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1429 
1430     for (j=0; j<n; j++) {
1431       xb = x + 15*(idx[j]);
1432       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1433 
1434       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1435       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1436       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1437       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1438       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1439       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1440       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1441       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1442       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1443       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1444       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1445       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1446       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1447       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1448       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1449 
1450       v += 60;
1451 
1452       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1453 
1454       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1455       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1456       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1457       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1458       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1459       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1460       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1461       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1462       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1463       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1464       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1465       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1466       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1467       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1468       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1469       v     += 60;
1470 
1471       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1472       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1473       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1474       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1475       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1476       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1477       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1478       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1479       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1480       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1481       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1482       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1483       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1484       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1485       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1486       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1487       v     += 60;
1488 
1489       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1490       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1491       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1492       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1493       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1494       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1495       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1496       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1497       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1498       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1499       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1500       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1501       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1502       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1503       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1504       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1505       v     += 45;
1506     }
1507     if (usecprow) z = zarray + 15*ridx[i];
1508     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1509     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1510 
1511     if (!usecprow) z += 15;
1512   }
1513 
1514   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1515   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1516   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1521 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1522 {
1523   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1524   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1525   const PetscScalar *x,*xb;
1526   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1527   const MatScalar   *v;
1528   PetscErrorCode    ierr;
1529   const PetscInt    *ii,*ij=a->j,*idx;
1530   PetscInt          mbs,i,j,n,*ridx=NULL;
1531   PetscBool         usecprow=a->compressedrow.use;
1532 
1533   PetscFunctionBegin;
1534   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1535   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1536 
1537   v = a->a;
1538   if (usecprow) {
1539     mbs  = a->compressedrow.nrows;
1540     ii   = a->compressedrow.i;
1541     ridx = a->compressedrow.rindex;
1542     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1543   } else {
1544     mbs = a->mbs;
1545     ii  = a->i;
1546     z   = zarray;
1547   }
1548 
1549   for (i=0; i<mbs; i++) {
1550     n    = ii[i+1] - ii[i];
1551     idx  = ij + ii[i];
1552     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1553     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1554 
1555     for (j=0; j<n; j++) {
1556       xb = x + 15*(idx[j]);
1557       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1558       x8 = xb[7];
1559 
1560       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1561       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1562       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1563       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1564       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1565       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1566       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1567       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1568       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1569       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1570       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1571       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1572       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1573       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1574       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1575       v     += 120;
1576 
1577       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1578 
1579       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1580       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1581       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1582       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1583       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1584       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1585       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1586       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1587       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1588       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1589       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1590       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1591       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1592       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1593       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1594       v     += 105;
1595     }
1596     if (usecprow) z = zarray + 15*ridx[i];
1597     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1598     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1599 
1600     if (!usecprow) z += 15;
1601   }
1602 
1603   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1604   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1605   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1606   PetscFunctionReturn(0);
1607 }
1608 
1609 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1610 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1611 {
1612   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1613   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1614   const PetscScalar *x,*xb;
1615   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1616   const MatScalar   *v;
1617   PetscErrorCode    ierr;
1618   const PetscInt    *ii,*ij=a->j,*idx;
1619   PetscInt          mbs,i,j,n,*ridx=NULL;
1620   PetscBool         usecprow=a->compressedrow.use;
1621 
1622   PetscFunctionBegin;
1623   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1624   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1625 
1626   v = a->a;
1627   if (usecprow) {
1628     mbs  = a->compressedrow.nrows;
1629     ii   = a->compressedrow.i;
1630     ridx = a->compressedrow.rindex;
1631     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1632   } else {
1633     mbs = a->mbs;
1634     ii  = a->i;
1635     z   = zarray;
1636   }
1637 
1638   for (i=0; i<mbs; i++) {
1639     n    = ii[i+1] - ii[i];
1640     idx  = ij + ii[i];
1641     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1642     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1643 
1644     for (j=0; j<n; j++) {
1645       xb = x + 15*(idx[j]);
1646       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1647       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1648 
1649       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1650       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1651       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1652       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1653       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1654       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1655       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1656       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1657       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1658       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1659       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1660       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1661       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1662       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1663       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1664       v     += 225;
1665     }
1666     if (usecprow) z = zarray + 15*ridx[i];
1667     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1668     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1669 
1670     if (!usecprow) z += 15;
1671   }
1672 
1673   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1674   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1675   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1676   PetscFunctionReturn(0);
1677 }
1678 
1679 /*
1680     This will not work with MatScalar == float because it calls the BLAS
1681 */
1682 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1683 {
1684   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1685   PetscScalar       *z = NULL,*work,*workt,*zarray;
1686   const PetscScalar *x,*xb;
1687   const MatScalar   *v;
1688   PetscErrorCode    ierr;
1689   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1690   const PetscInt    *idx,*ii,*ridx=NULL;
1691   PetscInt          ncols,k;
1692   PetscBool         usecprow=a->compressedrow.use;
1693 
1694   PetscFunctionBegin;
1695   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1696   ierr = VecGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1697 
1698   idx = a->j;
1699   v   = a->a;
1700   if (usecprow) {
1701     mbs  = a->compressedrow.nrows;
1702     ii   = a->compressedrow.i;
1703     ridx = a->compressedrow.rindex;
1704     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1705   } else {
1706     mbs = a->mbs;
1707     ii  = a->i;
1708     z   = zarray;
1709   }
1710 
1711   if (!a->mult_work) {
1712     k    = PetscMax(A->rmap->n,A->cmap->n);
1713     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1714   }
1715   work = a->mult_work;
1716   for (i=0; i<mbs; i++) {
1717     n           = ii[1] - ii[0]; ii++;
1718     ncols       = n*bs;
1719     workt       = work;
1720     for (j=0; j<n; j++) {
1721       xb = x + bs*(*idx++);
1722       for (k=0; k<bs; k++) workt[k] = xb[k];
1723       workt += bs;
1724     }
1725     if (usecprow) z = zarray + bs*ridx[i];
1726     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1727     v += n*bs2;
1728     if (!usecprow) z += bs;
1729   }
1730   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1731   ierr = VecRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1732   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1737 {
1738   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1739   const PetscScalar *x;
1740   PetscScalar       *y,*z,sum;
1741   const MatScalar   *v;
1742   PetscErrorCode    ierr;
1743   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1744   const PetscInt    *idx,*ii;
1745   PetscBool         usecprow=a->compressedrow.use;
1746 
1747   PetscFunctionBegin;
1748   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1749   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1750 
1751   idx = a->j;
1752   v   = a->a;
1753   if (usecprow) {
1754     if (zz != yy) {
1755       ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr);
1756     }
1757     mbs  = a->compressedrow.nrows;
1758     ii   = a->compressedrow.i;
1759     ridx = a->compressedrow.rindex;
1760   } else {
1761     ii = a->i;
1762   }
1763 
1764   for (i=0; i<mbs; i++) {
1765     n = ii[1] - ii[0];
1766     ii++;
1767     if (!usecprow) {
1768       sum = y[i];
1769     } else {
1770       sum = y[ridx[i]];
1771     }
1772     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1773     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1774     PetscSparseDensePlusDot(sum,x,v,idx,n);
1775     v   += n;
1776     idx += n;
1777     if (usecprow) {
1778       z[ridx[i]] = sum;
1779     } else {
1780       z[i] = sum;
1781     }
1782   }
1783   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1784   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1785   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1786   PetscFunctionReturn(0);
1787 }
1788 
1789 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1790 {
1791   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1792   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1793   const PetscScalar *x,*xb;
1794   PetscScalar       x1,x2,*yarray,*zarray;
1795   const MatScalar   *v;
1796   PetscErrorCode    ierr;
1797   PetscInt          mbs = a->mbs,i,n,j;
1798   const PetscInt    *idx,*ii,*ridx = NULL;
1799   PetscBool         usecprow = a->compressedrow.use;
1800 
1801   PetscFunctionBegin;
1802   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1803   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1804 
1805   idx = a->j;
1806   v   = a->a;
1807   if (usecprow) {
1808     if (zz != yy) {
1809       ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr);
1810     }
1811     mbs  = a->compressedrow.nrows;
1812     ii   = a->compressedrow.i;
1813     ridx = a->compressedrow.rindex;
1814   } else {
1815     ii = a->i;
1816     y  = yarray;
1817     z  = zarray;
1818   }
1819 
1820   for (i=0; i<mbs; i++) {
1821     n = ii[1] - ii[0]; ii++;
1822     if (usecprow) {
1823       z = zarray + 2*ridx[i];
1824       y = yarray + 2*ridx[i];
1825     }
1826     sum1 = y[0]; sum2 = y[1];
1827     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1828     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1829     for (j=0; j<n; j++) {
1830       xb = x + 2*(*idx++);
1831       x1 = xb[0];
1832       x2 = xb[1];
1833 
1834       sum1 += v[0]*x1 + v[2]*x2;
1835       sum2 += v[1]*x1 + v[3]*x2;
1836       v    += 4;
1837     }
1838     z[0] = sum1; z[1] = sum2;
1839     if (!usecprow) {
1840       z += 2; y += 2;
1841     }
1842   }
1843   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1844   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1845   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1846   PetscFunctionReturn(0);
1847 }
1848 
1849 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1850 {
1851   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1852   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1853   const PetscScalar *x,*xb;
1854   const MatScalar   *v;
1855   PetscErrorCode    ierr;
1856   PetscInt          mbs = a->mbs,i,j,n;
1857   const PetscInt    *idx,*ii,*ridx = NULL;
1858   PetscBool         usecprow = a->compressedrow.use;
1859 
1860   PetscFunctionBegin;
1861   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1862   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1863 
1864   idx = a->j;
1865   v   = a->a;
1866   if (usecprow) {
1867     if (zz != yy) {
1868       ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr);
1869     }
1870     mbs  = a->compressedrow.nrows;
1871     ii   = a->compressedrow.i;
1872     ridx = a->compressedrow.rindex;
1873   } else {
1874     ii = a->i;
1875     y  = yarray;
1876     z  = zarray;
1877   }
1878 
1879   for (i=0; i<mbs; i++) {
1880     n = ii[1] - ii[0]; ii++;
1881     if (usecprow) {
1882       z = zarray + 3*ridx[i];
1883       y = yarray + 3*ridx[i];
1884     }
1885     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1886     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1887     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1888     for (j=0; j<n; j++) {
1889       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1890       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1891       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1892       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1893       v    += 9;
1894     }
1895     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1896     if (!usecprow) {
1897       z += 3; y += 3;
1898     }
1899   }
1900   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1901   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1902   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1903   PetscFunctionReturn(0);
1904 }
1905 
1906 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1907 {
1908   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1909   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1910   const PetscScalar *x,*xb;
1911   const MatScalar   *v;
1912   PetscErrorCode    ierr;
1913   PetscInt          mbs = a->mbs,i,j,n;
1914   const PetscInt    *idx,*ii,*ridx=NULL;
1915   PetscBool         usecprow=a->compressedrow.use;
1916 
1917   PetscFunctionBegin;
1918   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1919   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1920 
1921   idx = a->j;
1922   v   = a->a;
1923   if (usecprow) {
1924     if (zz != yy) {
1925       ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr);
1926     }
1927     mbs  = a->compressedrow.nrows;
1928     ii   = a->compressedrow.i;
1929     ridx = a->compressedrow.rindex;
1930   } else {
1931     ii = a->i;
1932     y  = yarray;
1933     z  = zarray;
1934   }
1935 
1936   for (i=0; i<mbs; i++) {
1937     n = ii[1] - ii[0]; ii++;
1938     if (usecprow) {
1939       z = zarray + 4*ridx[i];
1940       y = yarray + 4*ridx[i];
1941     }
1942     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1943     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1944     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1945     for (j=0; j<n; j++) {
1946       xb    = x + 4*(*idx++);
1947       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1948       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1949       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1950       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1951       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1952       v    += 16;
1953     }
1954     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1955     if (!usecprow) {
1956       z += 4; y += 4;
1957     }
1958   }
1959   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1960   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1961   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1962   PetscFunctionReturn(0);
1963 }
1964 
1965 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1966 {
1967   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1968   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1969   const PetscScalar *x,*xb;
1970   PetscScalar       *yarray,*zarray;
1971   const MatScalar   *v;
1972   PetscErrorCode    ierr;
1973   PetscInt          mbs = a->mbs,i,j,n;
1974   const PetscInt    *idx,*ii,*ridx = NULL;
1975   PetscBool         usecprow=a->compressedrow.use;
1976 
1977   PetscFunctionBegin;
1978   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1979   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1980 
1981   idx = a->j;
1982   v   = a->a;
1983   if (usecprow) {
1984     if (zz != yy) {
1985       ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr);
1986     }
1987     mbs  = a->compressedrow.nrows;
1988     ii   = a->compressedrow.i;
1989     ridx = a->compressedrow.rindex;
1990   } else {
1991     ii = a->i;
1992     y  = yarray;
1993     z  = zarray;
1994   }
1995 
1996   for (i=0; i<mbs; i++) {
1997     n = ii[1] - ii[0]; ii++;
1998     if (usecprow) {
1999       z = zarray + 5*ridx[i];
2000       y = yarray + 5*ridx[i];
2001     }
2002     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
2003     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2004     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2005     for (j=0; j<n; j++) {
2006       xb    = x + 5*(*idx++);
2007       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2008       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2009       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2010       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2011       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2012       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2013       v    += 25;
2014     }
2015     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2016     if (!usecprow) {
2017       z += 5; y += 5;
2018     }
2019   }
2020   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2021   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2022   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
2023   PetscFunctionReturn(0);
2024 }
2025 
2026 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2027 {
2028   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2029   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2030   const PetscScalar *x,*xb;
2031   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2032   const MatScalar   *v;
2033   PetscErrorCode    ierr;
2034   PetscInt          mbs = a->mbs,i,j,n;
2035   const PetscInt    *idx,*ii,*ridx=NULL;
2036   PetscBool         usecprow=a->compressedrow.use;
2037 
2038   PetscFunctionBegin;
2039   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2040   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2041 
2042   idx = a->j;
2043   v   = a->a;
2044   if (usecprow) {
2045     if (zz != yy) {
2046       ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr);
2047     }
2048     mbs  = a->compressedrow.nrows;
2049     ii   = a->compressedrow.i;
2050     ridx = a->compressedrow.rindex;
2051   } else {
2052     ii = a->i;
2053     y  = yarray;
2054     z  = zarray;
2055   }
2056 
2057   for (i=0; i<mbs; i++) {
2058     n = ii[1] - ii[0]; ii++;
2059     if (usecprow) {
2060       z = zarray + 6*ridx[i];
2061       y = yarray + 6*ridx[i];
2062     }
2063     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2064     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2065     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2066     for (j=0; j<n; j++) {
2067       xb    = x + 6*(*idx++);
2068       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2069       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2070       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2071       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2072       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2073       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2074       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2075       v    += 36;
2076     }
2077     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2078     if (!usecprow) {
2079       z += 6; y += 6;
2080     }
2081   }
2082   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2083   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2084   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
2085   PetscFunctionReturn(0);
2086 }
2087 
2088 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2089 {
2090   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2091   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2092   const PetscScalar *x,*xb;
2093   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2094   const MatScalar   *v;
2095   PetscErrorCode    ierr;
2096   PetscInt          mbs = a->mbs,i,j,n;
2097   const PetscInt    *idx,*ii,*ridx = NULL;
2098   PetscBool         usecprow=a->compressedrow.use;
2099 
2100   PetscFunctionBegin;
2101   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2102   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2103 
2104   idx = a->j;
2105   v   = a->a;
2106   if (usecprow) {
2107     if (zz != yy) {
2108       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
2109     }
2110     mbs  = a->compressedrow.nrows;
2111     ii   = a->compressedrow.i;
2112     ridx = a->compressedrow.rindex;
2113   } else {
2114     ii = a->i;
2115     y  = yarray;
2116     z  = zarray;
2117   }
2118 
2119   for (i=0; i<mbs; i++) {
2120     n = ii[1] - ii[0]; ii++;
2121     if (usecprow) {
2122       z = zarray + 7*ridx[i];
2123       y = yarray + 7*ridx[i];
2124     }
2125     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2126     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2127     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2128     for (j=0; j<n; j++) {
2129       xb    = x + 7*(*idx++);
2130       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2131       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2132       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2133       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2134       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2135       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2136       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2137       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2138       v    += 49;
2139     }
2140     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2141     if (!usecprow) {
2142       z += 7; y += 7;
2143     }
2144   }
2145   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2146   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2147   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
2148   PetscFunctionReturn(0);
2149 }
2150 
2151 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2152 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2153 {
2154   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2155   PetscScalar       *z = NULL,*work,*workt,*zarray;
2156   const PetscScalar *x,*xb;
2157   const MatScalar   *v;
2158   PetscErrorCode    ierr;
2159   PetscInt          mbs,i,j,n;
2160   PetscInt          k;
2161   PetscBool         usecprow=a->compressedrow.use;
2162   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2163 
2164   __m256d a0,a1,a2,a3,a4,a5;
2165   __m256d w0,w1,w2,w3;
2166   __m256d z0,z1,z2;
2167   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2168 
2169   PetscFunctionBegin;
2170   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2171   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2172   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2173 
2174   idx = a->j;
2175   v   = a->a;
2176   if (usecprow) {
2177     mbs  = a->compressedrow.nrows;
2178     ii   = a->compressedrow.i;
2179     ridx = a->compressedrow.rindex;
2180   } else {
2181     mbs = a->mbs;
2182     ii  = a->i;
2183     z   = zarray;
2184   }
2185 
2186   if (!a->mult_work) {
2187     k    = PetscMax(A->rmap->n,A->cmap->n);
2188     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2189   }
2190 
2191   work = a->mult_work;
2192   for (i=0; i<mbs; i++) {
2193     n           = ii[1] - ii[0]; ii++;
2194     workt       = work;
2195     for (j=0; j<n; j++) {
2196       xb = x + bs*(*idx++);
2197       for (k=0; k<bs; k++) workt[k] = xb[k];
2198       workt += bs;
2199     }
2200     if (usecprow) z = zarray + bs*ridx[i];
2201 
2202     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2203 
2204     for (j=0; j<n; j++) {
2205       /* first column of a */
2206       w0 = _mm256_set1_pd(work[j*9  ]);
2207       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2208       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2209       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2210 
2211       /* second column of a */
2212       w1 = _mm256_set1_pd(work[j*9+ 1]);
2213       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2214       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2215       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2216 
2217       /* third column of a */
2218       w2 = _mm256_set1_pd(work[j*9+ 2]);
2219       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2220       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2221       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2222 
2223       /* fourth column of a */
2224       w3 = _mm256_set1_pd(work[j*9+ 3]);
2225       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2226       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2227       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2228 
2229       /* fifth column of a */
2230       w0 = _mm256_set1_pd(work[j*9+ 4]);
2231       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2232       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2233       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2234 
2235       /* sixth column of a */
2236       w1 = _mm256_set1_pd(work[j*9+ 5]);
2237       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2238       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2239       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2240 
2241       /* seventh column of a */
2242       w2 = _mm256_set1_pd(work[j*9+ 6]);
2243       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2244       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2245       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2246 
2247       /* eigth column of a */
2248       w3 = _mm256_set1_pd(work[j*9+ 7]);
2249       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2250       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2251       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2252 
2253       /* ninth column of a */
2254       w0 = _mm256_set1_pd(work[j*9+ 8]);
2255       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2256       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2257       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2258     }
2259 
2260     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2261 
2262     v += n*bs2;
2263     if (!usecprow) z += bs;
2264   }
2265   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2266   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2267   ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
2268   PetscFunctionReturn(0);
2269 }
2270 #endif
2271 
2272 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2273 {
2274   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2275   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2276   const PetscScalar *x,*xb;
2277   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2278   const MatScalar   *v;
2279   PetscErrorCode    ierr;
2280   PetscInt          mbs = a->mbs,i,j,n;
2281   const PetscInt    *idx,*ii,*ridx = NULL;
2282   PetscBool         usecprow=a->compressedrow.use;
2283 
2284   PetscFunctionBegin;
2285   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2286   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2287 
2288   idx = a->j;
2289   v   = a->a;
2290   if (usecprow) {
2291     if (zz != yy) {
2292       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
2293     }
2294     mbs  = a->compressedrow.nrows;
2295     ii   = a->compressedrow.i;
2296     ridx = a->compressedrow.rindex;
2297   } else {
2298     ii = a->i;
2299     y  = yarray;
2300     z  = zarray;
2301   }
2302 
2303   for (i=0; i<mbs; i++) {
2304     n = ii[1] - ii[0]; ii++;
2305     if (usecprow) {
2306       z = zarray + 11*ridx[i];
2307       y = yarray + 11*ridx[i];
2308     }
2309     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2310     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2311     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2312     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2313     for (j=0; j<n; j++) {
2314       xb    = x + 11*(*idx++);
2315       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2316       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2317       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2318       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2319       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2320       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2321       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2322       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2323       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2324       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2325       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2326       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2327       v    += 121;
2328     }
2329     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2330     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
2331     if (!usecprow) {
2332       z += 11; y += 11;
2333     }
2334   }
2335   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2336   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
2337   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
2338   PetscFunctionReturn(0);
2339 }
2340 
2341 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2342 {
2343   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2344   PetscScalar       *z = NULL,*work,*workt,*zarray;
2345   const PetscScalar *x,*xb;
2346   const MatScalar   *v;
2347   PetscErrorCode    ierr;
2348   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2349   PetscInt          ncols,k;
2350   const PetscInt    *ridx = NULL,*idx,*ii;
2351   PetscBool         usecprow = a->compressedrow.use;
2352 
2353   PetscFunctionBegin;
2354   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
2355   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2356   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
2357 
2358   idx = a->j;
2359   v   = a->a;
2360   if (usecprow) {
2361     mbs  = a->compressedrow.nrows;
2362     ii   = a->compressedrow.i;
2363     ridx = a->compressedrow.rindex;
2364   } else {
2365     mbs = a->mbs;
2366     ii  = a->i;
2367     z   = zarray;
2368   }
2369 
2370   if (!a->mult_work) {
2371     k    = PetscMax(A->rmap->n,A->cmap->n);
2372     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2373   }
2374   work = a->mult_work;
2375   for (i=0; i<mbs; i++) {
2376     n     = ii[1] - ii[0]; ii++;
2377     ncols = n*bs;
2378     workt = work;
2379     for (j=0; j<n; j++) {
2380       xb = x + bs*(*idx++);
2381       for (k=0; k<bs; k++) workt[k] = xb[k];
2382       workt += bs;
2383     }
2384     if (usecprow) z = zarray + bs*ridx[i];
2385     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2386     v += n*bs2;
2387     if (!usecprow) z += bs;
2388   }
2389   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2390   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
2391   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
2392   PetscFunctionReturn(0);
2393 }
2394 
2395 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2396 {
2397   PetscScalar    zero = 0.0;
2398   PetscErrorCode ierr;
2399 
2400   PetscFunctionBegin;
2401   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2402   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2403   PetscFunctionReturn(0);
2404 }
2405 
2406 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2407 {
2408   PetscScalar    zero = 0.0;
2409   PetscErrorCode ierr;
2410 
2411   PetscFunctionBegin;
2412   ierr = VecSet(zz,zero);CHKERRQ(ierr);
2413   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
2414   PetscFunctionReturn(0);
2415 }
2416 
2417 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2418 {
2419   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2420   PetscScalar       *z,x1,x2,x3,x4,x5;
2421   const PetscScalar *x,*xb = NULL;
2422   const MatScalar   *v;
2423   PetscErrorCode    ierr;
2424   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2425   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2426   Mat_CompressedRow cprow = a->compressedrow;
2427   PetscBool         usecprow = cprow.use;
2428 
2429   PetscFunctionBegin;
2430   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2431   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2432   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2433 
2434   idx = a->j;
2435   v   = a->a;
2436   if (usecprow) {
2437     mbs  = cprow.nrows;
2438     ii   = cprow.i;
2439     ridx = cprow.rindex;
2440   } else {
2441     mbs=a->mbs;
2442     ii = a->i;
2443     xb = x;
2444   }
2445 
2446   switch (bs) {
2447   case 1:
2448     for (i=0; i<mbs; i++) {
2449       if (usecprow) xb = x + ridx[i];
2450       x1 = xb[0];
2451       ib = idx + ii[0];
2452       n  = ii[1] - ii[0]; ii++;
2453       for (j=0; j<n; j++) {
2454         rval     = ib[j];
2455         z[rval] += PetscConj(*v) * x1;
2456         v++;
2457       }
2458       if (!usecprow) xb++;
2459     }
2460     break;
2461   case 2:
2462     for (i=0; i<mbs; i++) {
2463       if (usecprow) xb = x + 2*ridx[i];
2464       x1 = xb[0]; x2 = xb[1];
2465       ib = idx + ii[0];
2466       n  = ii[1] - ii[0]; ii++;
2467       for (j=0; j<n; j++) {
2468         rval       = ib[j]*2;
2469         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2470         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2471         v         += 4;
2472       }
2473       if (!usecprow) xb += 2;
2474     }
2475     break;
2476   case 3:
2477     for (i=0; i<mbs; i++) {
2478       if (usecprow) xb = x + 3*ridx[i];
2479       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2480       ib = idx + ii[0];
2481       n  = ii[1] - ii[0]; ii++;
2482       for (j=0; j<n; j++) {
2483         rval       = ib[j]*3;
2484         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2485         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2486         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2487         v         += 9;
2488       }
2489       if (!usecprow) xb += 3;
2490     }
2491     break;
2492   case 4:
2493     for (i=0; i<mbs; i++) {
2494       if (usecprow) xb = x + 4*ridx[i];
2495       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2496       ib = idx + ii[0];
2497       n  = ii[1] - ii[0]; ii++;
2498       for (j=0; j<n; j++) {
2499         rval       = ib[j]*4;
2500         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2501         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2502         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2503         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2504         v         += 16;
2505       }
2506       if (!usecprow) xb += 4;
2507     }
2508     break;
2509   case 5:
2510     for (i=0; i<mbs; i++) {
2511       if (usecprow) xb = x + 5*ridx[i];
2512       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2513       x4 = xb[3]; x5 = xb[4];
2514       ib = idx + ii[0];
2515       n  = ii[1] - ii[0]; ii++;
2516       for (j=0; j<n; j++) {
2517         rval       = ib[j]*5;
2518         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2519         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2520         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2521         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2522         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2523         v         += 25;
2524       }
2525       if (!usecprow) xb += 5;
2526     }
2527     break;
2528   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2529     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2530 #if 0
2531     {
2532       PetscInt          ncols,k,bs2=a->bs2;
2533       PetscScalar       *work,*workt,zb;
2534       const PetscScalar *xtmp;
2535       if (!a->mult_work) {
2536         k    = PetscMax(A->rmap->n,A->cmap->n);
2537         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2538       }
2539       work = a->mult_work;
2540       xtmp = x;
2541       for (i=0; i<mbs; i++) {
2542         n     = ii[1] - ii[0]; ii++;
2543         ncols = n*bs;
2544         ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2545         if (usecprow) xtmp = x + bs*ridx[i];
2546         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2547         v += n*bs2;
2548         if (!usecprow) xtmp += bs;
2549         workt = work;
2550         for (j=0; j<n; j++) {
2551           zb = z + bs*(*idx++);
2552           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2553           workt += bs;
2554         }
2555       }
2556     }
2557 #endif
2558   }
2559   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2560   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2561   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2562   PetscFunctionReturn(0);
2563 }
2564 
2565 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2566 {
2567   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2568   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2569   const PetscScalar *x,*xb = NULL;
2570   const MatScalar   *v;
2571   PetscErrorCode    ierr;
2572   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2573   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2574   Mat_CompressedRow cprow   = a->compressedrow;
2575   PetscBool         usecprow=cprow.use;
2576 
2577   PetscFunctionBegin;
2578   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2579   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2580   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2581 
2582   idx = a->j;
2583   v   = a->a;
2584   if (usecprow) {
2585     mbs  = cprow.nrows;
2586     ii   = cprow.i;
2587     ridx = cprow.rindex;
2588   } else {
2589     mbs=a->mbs;
2590     ii = a->i;
2591     xb = x;
2592   }
2593 
2594   switch (bs) {
2595   case 1:
2596     for (i=0; i<mbs; i++) {
2597       if (usecprow) xb = x + ridx[i];
2598       x1 = xb[0];
2599       ib = idx + ii[0];
2600       n  = ii[1] - ii[0]; ii++;
2601       for (j=0; j<n; j++) {
2602         rval     = ib[j];
2603         z[rval] += *v * x1;
2604         v++;
2605       }
2606       if (!usecprow) xb++;
2607     }
2608     break;
2609   case 2:
2610     for (i=0; i<mbs; i++) {
2611       if (usecprow) xb = x + 2*ridx[i];
2612       x1 = xb[0]; x2 = xb[1];
2613       ib = idx + ii[0];
2614       n  = ii[1] - ii[0]; ii++;
2615       for (j=0; j<n; j++) {
2616         rval       = ib[j]*2;
2617         z[rval++] += v[0]*x1 + v[1]*x2;
2618         z[rval++] += v[2]*x1 + v[3]*x2;
2619         v         += 4;
2620       }
2621       if (!usecprow) xb += 2;
2622     }
2623     break;
2624   case 3:
2625     for (i=0; i<mbs; i++) {
2626       if (usecprow) xb = x + 3*ridx[i];
2627       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2628       ib = idx + ii[0];
2629       n  = ii[1] - ii[0]; ii++;
2630       for (j=0; j<n; j++) {
2631         rval       = ib[j]*3;
2632         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2633         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2634         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2635         v         += 9;
2636       }
2637       if (!usecprow) xb += 3;
2638     }
2639     break;
2640   case 4:
2641     for (i=0; i<mbs; i++) {
2642       if (usecprow) xb = x + 4*ridx[i];
2643       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2644       ib = idx + ii[0];
2645       n  = ii[1] - ii[0]; ii++;
2646       for (j=0; j<n; j++) {
2647         rval       = ib[j]*4;
2648         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2649         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2650         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2651         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2652         v         += 16;
2653       }
2654       if (!usecprow) xb += 4;
2655     }
2656     break;
2657   case 5:
2658     for (i=0; i<mbs; i++) {
2659       if (usecprow) xb = x + 5*ridx[i];
2660       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2661       x4 = xb[3]; x5 = xb[4];
2662       ib = idx + ii[0];
2663       n  = ii[1] - ii[0]; ii++;
2664       for (j=0; j<n; j++) {
2665         rval       = ib[j]*5;
2666         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2667         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2668         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2669         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2670         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2671         v         += 25;
2672       }
2673       if (!usecprow) xb += 5;
2674     }
2675     break;
2676   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2677     PetscInt          ncols,k;
2678     PetscScalar       *work,*workt;
2679     const PetscScalar *xtmp;
2680     if (!a->mult_work) {
2681       k    = PetscMax(A->rmap->n,A->cmap->n);
2682       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2683     }
2684     work = a->mult_work;
2685     xtmp = x;
2686     for (i=0; i<mbs; i++) {
2687       n     = ii[1] - ii[0]; ii++;
2688       ncols = n*bs;
2689       ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2690       if (usecprow) xtmp = x + bs*ridx[i];
2691       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2692       v += n*bs2;
2693       if (!usecprow) xtmp += bs;
2694       workt = work;
2695       for (j=0; j<n; j++) {
2696         zb = z + bs*(*idx++);
2697         for (k=0; k<bs; k++) zb[k] += workt[k];
2698         workt += bs;
2699       }
2700     }
2701     }
2702   }
2703   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2704   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2705   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2706   PetscFunctionReturn(0);
2707 }
2708 
2709 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2710 {
2711   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2712   PetscInt       totalnz = a->bs2*a->nz;
2713   PetscScalar    oalpha  = alpha;
2714   PetscErrorCode ierr;
2715   PetscBLASInt   one = 1,tnz;
2716 
2717   PetscFunctionBegin;
2718   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2719   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2720   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2721   PetscFunctionReturn(0);
2722 }
2723 
2724 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2725 {
2726   PetscErrorCode ierr;
2727   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2728   MatScalar      *v  = a->a;
2729   PetscReal      sum = 0.0;
2730   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2731 
2732   PetscFunctionBegin;
2733   if (type == NORM_FROBENIUS) {
2734 #if defined(PETSC_USE_REAL___FP16)
2735     PetscBLASInt one = 1,cnt = bs2*nz;
2736     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2737 #else
2738     for (i=0; i<bs2*nz; i++) {
2739       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2740     }
2741 #endif
2742     *norm = PetscSqrtReal(sum);
2743     ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr);
2744   } else if (type == NORM_1) { /* maximum column sum */
2745     PetscReal *tmp;
2746     PetscInt  *bcol = a->j;
2747     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2748     for (i=0; i<nz; i++) {
2749       for (j=0; j<bs; j++) {
2750         k1 = bs*(*bcol) + j; /* column index */
2751         for (k=0; k<bs; k++) {
2752           tmp[k1] += PetscAbsScalar(*v); v++;
2753         }
2754       }
2755       bcol++;
2756     }
2757     *norm = 0.0;
2758     for (j=0; j<A->cmap->n; j++) {
2759       if (tmp[j] > *norm) *norm = tmp[j];
2760     }
2761     ierr = PetscFree(tmp);CHKERRQ(ierr);
2762     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2763   } else if (type == NORM_INFINITY) { /* maximum row sum */
2764     *norm = 0.0;
2765     for (k=0; k<bs; k++) {
2766       for (j=0; j<a->mbs; j++) {
2767         v   = a->a + bs2*a->i[j] + k;
2768         sum = 0.0;
2769         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2770           for (k1=0; k1<bs; k1++) {
2771             sum += PetscAbsScalar(*v);
2772             v   += bs;
2773           }
2774         }
2775         if (sum > *norm) *norm = sum;
2776       }
2777     }
2778     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2779   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2780   PetscFunctionReturn(0);
2781 }
2782 
2783 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2784 {
2785   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2786   PetscErrorCode ierr;
2787 
2788   PetscFunctionBegin;
2789   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2790   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2791     *flg = PETSC_FALSE;
2792     PetscFunctionReturn(0);
2793   }
2794 
2795   /* if the a->i are the same */
2796   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
2797   if (!*flg) PetscFunctionReturn(0);
2798 
2799   /* if a->j are the same */
2800   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
2801   if (!*flg) PetscFunctionReturn(0);
2802 
2803   /* if a->a are the same */
2804   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr);
2805   PetscFunctionReturn(0);
2806 
2807 }
2808 
2809 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2810 {
2811   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2812   PetscErrorCode ierr;
2813   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2814   PetscScalar    *x,zero = 0.0;
2815   MatScalar      *aa,*aa_j;
2816 
2817   PetscFunctionBegin;
2818   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2819   bs   = A->rmap->bs;
2820   aa   = a->a;
2821   ai   = a->i;
2822   aj   = a->j;
2823   ambs = a->mbs;
2824   bs2  = a->bs2;
2825 
2826   ierr = VecSet(v,zero);CHKERRQ(ierr);
2827   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2828   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2829   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2830   for (i=0; i<ambs; i++) {
2831     for (j=ai[i]; j<ai[i+1]; j++) {
2832       if (aj[j] == i) {
2833         row  = i*bs;
2834         aa_j = aa+j*bs2;
2835         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2836         break;
2837       }
2838     }
2839   }
2840   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2841   PetscFunctionReturn(0);
2842 }
2843 
2844 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2845 {
2846   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2847   const PetscScalar *l,*r,*li,*ri;
2848   PetscScalar       x;
2849   MatScalar         *aa, *v;
2850   PetscErrorCode    ierr;
2851   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2852   const PetscInt    *ai,*aj;
2853 
2854   PetscFunctionBegin;
2855   ai  = a->i;
2856   aj  = a->j;
2857   aa  = a->a;
2858   m   = A->rmap->n;
2859   n   = A->cmap->n;
2860   bs  = A->rmap->bs;
2861   mbs = a->mbs;
2862   bs2 = a->bs2;
2863   if (ll) {
2864     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2865     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2866     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2867     for (i=0; i<mbs; i++) { /* for each block row */
2868       M  = ai[i+1] - ai[i];
2869       li = l + i*bs;
2870       v  = aa + bs2*ai[i];
2871       for (j=0; j<M; j++) { /* for each block */
2872         for (k=0; k<bs2; k++) {
2873           (*v++) *= li[k%bs];
2874         }
2875       }
2876     }
2877     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2878     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2879   }
2880 
2881   if (rr) {
2882     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2883     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2884     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2885     for (i=0; i<mbs; i++) { /* for each block row */
2886       iai = ai[i];
2887       M   = ai[i+1] - iai;
2888       v   = aa + bs2*iai;
2889       for (j=0; j<M; j++) { /* for each block */
2890         ri = r + bs*aj[iai+j];
2891         for (k=0; k<bs; k++) {
2892           x = ri[k];
2893           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2894           v += bs;
2895         }
2896       }
2897     }
2898     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2899     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2900   }
2901   PetscFunctionReturn(0);
2902 }
2903 
2904 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2905 {
2906   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2907 
2908   PetscFunctionBegin;
2909   info->block_size   = a->bs2;
2910   info->nz_allocated = a->bs2*a->maxnz;
2911   info->nz_used      = a->bs2*a->nz;
2912   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2913   info->assemblies   = A->num_ass;
2914   info->mallocs      = A->info.mallocs;
2915   info->memory       = ((PetscObject)A)->mem;
2916   if (A->factortype) {
2917     info->fill_ratio_given  = A->info.fill_ratio_given;
2918     info->fill_ratio_needed = A->info.fill_ratio_needed;
2919     info->factor_mallocs    = A->info.factor_mallocs;
2920   } else {
2921     info->fill_ratio_given  = 0;
2922     info->fill_ratio_needed = 0;
2923     info->factor_mallocs    = 0;
2924   }
2925   PetscFunctionReturn(0);
2926 }
2927 
2928 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2929 {
2930   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2931   PetscErrorCode ierr;
2932 
2933   PetscFunctionBegin;
2934   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
2935   PetscFunctionReturn(0);
2936 }
2937 
2938 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2939 {
2940   PetscErrorCode ierr;
2941 
2942   PetscFunctionBegin;
2943   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
2944   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2945   PetscFunctionReturn(0);
2946 }
2947 
2948 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2949 {
2950   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2951   PetscScalar       *z = NULL,sum1;
2952   const PetscScalar *xb;
2953   PetscScalar       x1;
2954   const MatScalar   *v,*vv;
2955   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2956   PetscBool         usecprow=a->compressedrow.use;
2957 
2958   PetscFunctionBegin;
2959   idx = a->j;
2960   v   = a->a;
2961   if (usecprow) {
2962     mbs  = a->compressedrow.nrows;
2963     ii   = a->compressedrow.i;
2964     ridx = a->compressedrow.rindex;
2965   } else {
2966     mbs = a->mbs;
2967     ii  = a->i;
2968     z   = c;
2969   }
2970 
2971   for (i=0; i<mbs; i++) {
2972     n           = ii[1] - ii[0]; ii++;
2973     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2974     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2975     if (usecprow) z = c + ridx[i];
2976     jj = idx;
2977     vv = v;
2978     for (k=0; k<cn; k++) {
2979       idx = jj;
2980       v = vv;
2981       sum1    = 0.0;
2982       for (j=0; j<n; j++) {
2983         xb    = b + (*idx++); x1 = xb[0+k*bm];
2984         sum1 += v[0]*x1;
2985         v    += 1;
2986       }
2987       z[0+k*cm] = sum1;
2988     }
2989     if (!usecprow) z += 1;
2990   }
2991   PetscFunctionReturn(0);
2992 }
2993 
2994 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2995 {
2996   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2997   PetscScalar       *z = NULL,sum1,sum2;
2998   const PetscScalar *xb;
2999   PetscScalar       x1,x2;
3000   const MatScalar   *v,*vv;
3001   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3002   PetscBool         usecprow=a->compressedrow.use;
3003 
3004   PetscFunctionBegin;
3005   idx = a->j;
3006   v   = a->a;
3007   if (usecprow) {
3008     mbs  = a->compressedrow.nrows;
3009     ii   = a->compressedrow.i;
3010     ridx = a->compressedrow.rindex;
3011   } else {
3012     mbs = a->mbs;
3013     ii  = a->i;
3014     z   = c;
3015   }
3016 
3017   for (i=0; i<mbs; i++) {
3018     n           = ii[1] - ii[0]; ii++;
3019     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3020     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3021     if (usecprow) z = c + 2*ridx[i];
3022     jj = idx;
3023     vv = v;
3024     for (k=0; k<cn; k++) {
3025       idx = jj;
3026       v = vv;
3027       sum1    = 0.0; sum2 = 0.0;
3028       for (j=0; j<n; j++) {
3029         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3030         sum1 += v[0]*x1 + v[2]*x2;
3031         sum2 += v[1]*x1 + v[3]*x2;
3032         v    += 4;
3033       }
3034       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3035     }
3036     if (!usecprow) z += 2;
3037   }
3038   PetscFunctionReturn(0);
3039 }
3040 
3041 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3042 {
3043   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3044   PetscScalar       *z = NULL,sum1,sum2,sum3;
3045   const PetscScalar *xb;
3046   PetscScalar       x1,x2,x3;
3047   const MatScalar   *v,*vv;
3048   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3049   PetscBool         usecprow=a->compressedrow.use;
3050 
3051   PetscFunctionBegin;
3052   idx = a->j;
3053   v   = a->a;
3054   if (usecprow) {
3055     mbs  = a->compressedrow.nrows;
3056     ii   = a->compressedrow.i;
3057     ridx = a->compressedrow.rindex;
3058   } else {
3059     mbs = a->mbs;
3060     ii  = a->i;
3061     z   = c;
3062   }
3063 
3064   for (i=0; i<mbs; i++) {
3065     n           = ii[1] - ii[0]; ii++;
3066     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3067     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3068     if (usecprow) z = c + 3*ridx[i];
3069     jj = idx;
3070     vv = v;
3071     for (k=0; k<cn; k++) {
3072       idx = jj;
3073       v = vv;
3074       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3075       for (j=0; j<n; j++) {
3076         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3077         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3078         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3079         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3080         v    += 9;
3081       }
3082       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3083     }
3084     if (!usecprow) z += 3;
3085   }
3086   PetscFunctionReturn(0);
3087 }
3088 
3089 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3090 {
3091   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3092   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3093   const PetscScalar *xb;
3094   PetscScalar       x1,x2,x3,x4;
3095   const MatScalar   *v,*vv;
3096   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3097   PetscBool         usecprow=a->compressedrow.use;
3098 
3099   PetscFunctionBegin;
3100   idx = a->j;
3101   v   = a->a;
3102   if (usecprow) {
3103     mbs  = a->compressedrow.nrows;
3104     ii   = a->compressedrow.i;
3105     ridx = a->compressedrow.rindex;
3106   } else {
3107     mbs = a->mbs;
3108     ii  = a->i;
3109     z   = c;
3110   }
3111 
3112   for (i=0; i<mbs; i++) {
3113     n           = ii[1] - ii[0]; ii++;
3114     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3115     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3116     if (usecprow) z = c + 4*ridx[i];
3117     jj = idx;
3118     vv = v;
3119     for (k=0; k<cn; k++) {
3120       idx = jj;
3121       v = vv;
3122       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3123       for (j=0; j<n; j++) {
3124         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3125         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3126         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3127         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3128         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3129         v    += 16;
3130       }
3131       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3132     }
3133     if (!usecprow) z += 4;
3134   }
3135   PetscFunctionReturn(0);
3136 }
3137 
3138 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3139 {
3140   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3141   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3142   const PetscScalar *xb;
3143   PetscScalar       x1,x2,x3,x4,x5;
3144   const MatScalar   *v,*vv;
3145   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3146   PetscBool         usecprow=a->compressedrow.use;
3147 
3148   PetscFunctionBegin;
3149   idx = a->j;
3150   v   = a->a;
3151   if (usecprow) {
3152     mbs  = a->compressedrow.nrows;
3153     ii   = a->compressedrow.i;
3154     ridx = a->compressedrow.rindex;
3155   } else {
3156     mbs = a->mbs;
3157     ii  = a->i;
3158     z   = c;
3159   }
3160 
3161   for (i=0; i<mbs; i++) {
3162     n           = ii[1] - ii[0]; ii++;
3163     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3164     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3165     if (usecprow) z = c + 5*ridx[i];
3166     jj = idx;
3167     vv = v;
3168     for (k=0; k<cn; k++) {
3169       idx = jj;
3170       v = vv;
3171       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3172       for (j=0; j<n; j++) {
3173         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3174         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3175         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3176         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3177         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3178         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3179         v    += 25;
3180       }
3181       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3182     }
3183     if (!usecprow) z += 5;
3184   }
3185   PetscFunctionReturn(0);
3186 }
3187 
3188 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3189 {
3190   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3191   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3192   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3193   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3194   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3195   PetscBLASInt      bbs,bcn,bbm,bcm;
3196   PetscScalar       *z = NULL;
3197   PetscScalar       *c,*b;
3198   const MatScalar   *v;
3199   const PetscInt    *idx,*ii,*ridx=NULL;
3200   PetscScalar       _DZero=0.0,_DOne=1.0;
3201   PetscBool         usecprow=a->compressedrow.use;
3202   PetscErrorCode    ierr;
3203 
3204   PetscFunctionBegin;
3205   if (!cm || !cn) PetscFunctionReturn(0);
3206   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3207   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3208   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3209   b = bd->v;
3210   if (a->nonzerorowcnt != A->rmap->n) {
3211     ierr = MatZeroEntries(C);CHKERRQ(ierr);
3212   }
3213   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
3214   switch (bs) {
3215   case 1:
3216     ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3217     break;
3218   case 2:
3219     ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3220     break;
3221   case 3:
3222     ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3223     break;
3224   case 4:
3225     ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3226     break;
3227   case 5:
3228     ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);CHKERRQ(ierr);
3229     break;
3230   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3231     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
3232     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
3233     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
3234     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
3235     idx = a->j;
3236     v   = a->a;
3237     if (usecprow) {
3238       mbs  = a->compressedrow.nrows;
3239       ii   = a->compressedrow.i;
3240       ridx = a->compressedrow.rindex;
3241     } else {
3242       mbs = a->mbs;
3243       ii  = a->i;
3244       z   = c;
3245     }
3246     for (i=0; i<mbs; i++) {
3247       n = ii[1] - ii[0]; ii++;
3248       if (usecprow) z = c + bs*ridx[i];
3249       if (n) {
3250         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3251         v += bs2;
3252       }
3253       for (j=1; j<n; j++) {
3254         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3255         v += bs2;
3256       }
3257       if (!usecprow) z += bs;
3258     }
3259   }
3260   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
3261   ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr);
3262   PetscFunctionReturn(0);
3263 }
3264