xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision d2fd7bfc6f0fd2e1d083decbb7cc7d77e16824f0)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7 
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 #include <immintrin.h>
10 #endif
11 
12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscBool      flag;
88 
89   PetscFunctionBegin;
90   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
91   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
93   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
94 
95   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
96   ssmap = smap;
97   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
98   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99   /* determine lens of each row */
100   for (i=0; i<nrows; i++) {
101     kstart  = ai[irow[i]];
102     kend    = kstart + a->ilen[irow[i]];
103     lens[i] = 0;
104     for (k=kstart; k<kend; k++) {
105       if (ssmap[aj[k]]) lens[i]++;
106     }
107   }
108   /* Create and fill new matrix */
109   if (scall == MAT_REUSE_MATRIX) {
110     c = (Mat_SeqBAIJ*)((*B)->data);
111 
112     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
114     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
116     C    = *B;
117   } else {
118     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
119     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
120     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
121     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
122   }
123   c = (Mat_SeqBAIJ*)(C->data);
124   for (i=0; i<nrows; i++) {
125     row      = irow[i];
126     kstart   = ai[row];
127     kend     = kstart + a->ilen[row];
128     mat_i    = c->i[i];
129     mat_j    = c->j + mat_i;
130     mat_a    = c->a + mat_i*bs2;
131     mat_ilen = c->ilen + i;
132     for (k=kstart; k<kend; k++) {
133       if ((tcol=ssmap[a->j[k]])) {
134         *mat_j++ = tcol - 1;
135         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
136         mat_a   += bs2;
137         (*mat_ilen)++;
138       }
139     }
140   }
141   /* sort */
142   {
143     MatScalar *work;
144     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
145     for (i=0; i<nrows; i++) {
146       PetscInt ilen;
147       mat_i = c->i[i];
148       mat_j = c->j + mat_i;
149       mat_a = c->a + mat_i*bs2;
150       ilen  = c->ilen[i];
151       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
152     }
153     ierr = PetscFree(work);CHKERRQ(ierr);
154   }
155 
156   /* Free work space */
157   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
158   ierr = PetscFree(smap);CHKERRQ(ierr);
159   ierr = PetscFree(lens);CHKERRQ(ierr);
160   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162 
163   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
164   *B   = C;
165   PetscFunctionReturn(0);
166 }
167 
168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169 {
170   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
171   IS             is1,is2;
172   PetscErrorCode ierr;
173   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174   const PetscInt *irow,*icol;
175 
176   PetscFunctionBegin;
177   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
178   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
179   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
180   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
181 
182   /* Verify if the indices corespond to each element in a block
183    and form the IS with compressed IS */
184   maxmnbs = PetscMax(a->mbs,a->nbs);
185   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
186   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
187   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190   }
191   count = 0;
192   for (i=0; i<nrows; i++) {
193     j = irow[i] / bs;
194     if ((vary[j]--)==bs) iary[count++] = j;
195   }
196   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
197 
198   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
199   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200   for (i=0; i<a->nbs; i++) {
201     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202   }
203   count = 0;
204   for (i=0; i<ncols; i++) {
205     j = icol[i] / bs;
206     if ((vary[j]--)==bs) iary[count++] = j;
207   }
208   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
209   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
211   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
212 
213   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
214   ierr = ISDestroy(&is1);CHKERRQ(ierr);
215   ierr = ISDestroy(&is2);CHKERRQ(ierr);
216   PetscFunctionReturn(0);
217 }
218 
219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220 {
221   PetscErrorCode ierr;
222   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
223   Mat_SubSppt    *submatj = c->submatis1;
224 
225   PetscFunctionBegin;
226   ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
227   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
230 
231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232 {
233   PetscErrorCode ierr;
234   PetscInt       i;
235   Mat            C;
236   Mat_SeqBAIJ    *c;
237   Mat_SubSppt    *submatj;
238 
239   PetscFunctionBegin;
240   for (i=0; i<n; i++) {
241     C       = (*mat)[i];
242     c       = (Mat_SeqBAIJ*)C->data;
243     submatj = c->submatis1;
244     if (submatj) {
245       ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
246       ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
247       ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
248       ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
249       ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
250       ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
251     } else {
252       ierr = MatDestroy(&C);CHKERRQ(ierr);
253     }
254   }
255   ierr = PetscFree(*mat);CHKERRQ(ierr);
256   PetscFunctionReturn(0);
257 }
258 
259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260 {
261   PetscErrorCode ierr;
262   PetscInt       i;
263 
264   PetscFunctionBegin;
265   if (scall == MAT_INITIAL_MATRIX) {
266     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
267   }
268 
269   for (i=0; i<n; i++) {
270     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
271   }
272   PetscFunctionReturn(0);
273 }
274 
275 /* -------------------------------------------------------*/
276 /* Should check that shapes of vectors and matrices match */
277 /* -------------------------------------------------------*/
278 
279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282   PetscScalar       *z,sum;
283   const PetscScalar *x;
284   const MatScalar   *v;
285   PetscErrorCode    ierr;
286   PetscInt          mbs,i,n;
287   const PetscInt    *idx,*ii,*ridx=NULL;
288   PetscBool         usecprow=a->compressedrow.use;
289 
290   PetscFunctionBegin;
291   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
293 
294   if (usecprow) {
295     mbs  = a->compressedrow.nrows;
296     ii   = a->compressedrow.i;
297     ridx = a->compressedrow.rindex;
298     ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr);
299   } else {
300     mbs = a->mbs;
301     ii  = a->i;
302   }
303 
304   for (i=0; i<mbs; i++) {
305     n   = ii[1] - ii[0];
306     v   = a->a + ii[0];
307     idx = a->j + ii[0];
308     ii++;
309     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
310     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311     sum = 0.0;
312     PetscSparseDensePlusDot(sum,x,v,idx,n);
313     if (usecprow) {
314       z[ridx[i]] = sum;
315     } else {
316       z[i]       = sum;
317     }
318   }
319   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
320   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
321   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
322   PetscFunctionReturn(0);
323 }
324 
325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326 {
327   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
328   PetscScalar       *z = 0,sum1,sum2,*zarray;
329   const PetscScalar *x,*xb;
330   PetscScalar       x1,x2;
331   const MatScalar   *v;
332   PetscErrorCode    ierr;
333   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
334   PetscBool         usecprow=a->compressedrow.use;
335 
336   PetscFunctionBegin;
337   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
339 
340   idx = a->j;
341   v   = a->a;
342   if (usecprow) {
343     mbs  = a->compressedrow.nrows;
344     ii   = a->compressedrow.i;
345     ridx = a->compressedrow.rindex;
346     ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr);
347   } else {
348     mbs = a->mbs;
349     ii  = a->i;
350     z   = zarray;
351   }
352 
353   for (i=0; i<mbs; i++) {
354     n           = ii[1] - ii[0]; ii++;
355     sum1        = 0.0; sum2 = 0.0;
356     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
357     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358     for (j=0; j<n; j++) {
359       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360       sum1 += v[0]*x1 + v[2]*x2;
361       sum2 += v[1]*x1 + v[3]*x2;
362       v    += 4;
363     }
364     if (usecprow) z = zarray + 2*ridx[i];
365     z[0] = sum1; z[1] = sum2;
366     if (!usecprow) z += 2;
367   }
368   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
369   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
370   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
371   PetscFunctionReturn(0);
372 }
373 
374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375 {
376   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
377   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
378   const PetscScalar *x,*xb;
379   const MatScalar   *v;
380   PetscErrorCode    ierr;
381   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
382   PetscBool         usecprow=a->compressedrow.use;
383 
384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385 #pragma disjoint(*v,*z,*xb)
386 #endif
387 
388   PetscFunctionBegin;
389   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
391 
392   idx = a->j;
393   v   = a->a;
394   if (usecprow) {
395     mbs  = a->compressedrow.nrows;
396     ii   = a->compressedrow.i;
397     ridx = a->compressedrow.rindex;
398     ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr);
399   } else {
400     mbs = a->mbs;
401     ii  = a->i;
402     z   = zarray;
403   }
404 
405   for (i=0; i<mbs; i++) {
406     n           = ii[1] - ii[0]; ii++;
407     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
408     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410     for (j=0; j<n; j++) {
411       xb = x + 3*(*idx++);
412       x1 = xb[0];
413       x2 = xb[1];
414       x3 = xb[2];
415 
416       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419       v    += 9;
420     }
421     if (usecprow) z = zarray + 3*ridx[i];
422     z[0] = sum1; z[1] = sum2; z[2] = sum3;
423     if (!usecprow) z += 3;
424   }
425   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
426   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
427   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432 {
433   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
434   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435   const PetscScalar *x,*xb;
436   const MatScalar   *v;
437   PetscErrorCode    ierr;
438   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
439   PetscBool         usecprow=a->compressedrow.use;
440 
441   PetscFunctionBegin;
442   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
443   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
444 
445   idx = a->j;
446   v   = a->a;
447   if (usecprow) {
448     mbs  = a->compressedrow.nrows;
449     ii   = a->compressedrow.i;
450     ridx = a->compressedrow.rindex;
451     ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr);
452   } else {
453     mbs = a->mbs;
454     ii  = a->i;
455     z   = zarray;
456   }
457 
458   for (i=0; i<mbs; i++) {
459     n = ii[1] - ii[0];
460     ii++;
461     sum1 = 0.0;
462     sum2 = 0.0;
463     sum3 = 0.0;
464     sum4 = 0.0;
465 
466     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
467     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468     for (j=0; j<n; j++) {
469       xb    = x + 4*(*idx++);
470       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
472       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
473       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
474       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
475       v    += 16;
476     }
477     if (usecprow) z = zarray + 4*ridx[i];
478     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479     if (!usecprow) z += 4;
480   }
481   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
482   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
483   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
484   PetscFunctionReturn(0);
485 }
486 
487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488 {
489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
491   const PetscScalar *xb,*x;
492   const MatScalar   *v;
493   PetscErrorCode    ierr;
494   const PetscInt    *idx,*ii,*ridx=NULL;
495   PetscInt          mbs,i,j,n;
496   PetscBool         usecprow=a->compressedrow.use;
497 
498   PetscFunctionBegin;
499   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
500   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
501 
502   idx = a->j;
503   v   = a->a;
504   if (usecprow) {
505     mbs  = a->compressedrow.nrows;
506     ii   = a->compressedrow.i;
507     ridx = a->compressedrow.rindex;
508     ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr);
509   } else {
510     mbs = a->mbs;
511     ii  = a->i;
512     z   = zarray;
513   }
514 
515   for (i=0; i<mbs; i++) {
516     n           = ii[1] - ii[0]; ii++;
517     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
519     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520     for (j=0; j<n; j++) {
521       xb    = x + 5*(*idx++);
522       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
524       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
525       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
526       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
527       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
528       v    += 25;
529     }
530     if (usecprow) z = zarray + 5*ridx[i];
531     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532     if (!usecprow) z += 5;
533   }
534   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
535   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
536   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 
541 
542 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
543 {
544   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
545   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
546   const PetscScalar *x,*xb;
547   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
548   const MatScalar   *v;
549   PetscErrorCode    ierr;
550   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
551   PetscBool         usecprow=a->compressedrow.use;
552 
553   PetscFunctionBegin;
554   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
555   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
556 
557   idx = a->j;
558   v   = a->a;
559   if (usecprow) {
560     mbs  = a->compressedrow.nrows;
561     ii   = a->compressedrow.i;
562     ridx = a->compressedrow.rindex;
563     ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr);
564   } else {
565     mbs = a->mbs;
566     ii  = a->i;
567     z   = zarray;
568   }
569 
570   for (i=0; i<mbs; i++) {
571     n  = ii[1] - ii[0];
572     ii++;
573     sum1 = 0.0;
574     sum2 = 0.0;
575     sum3 = 0.0;
576     sum4 = 0.0;
577     sum5 = 0.0;
578     sum6 = 0.0;
579 
580     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
581     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
582     for (j=0; j<n; j++) {
583       xb    = x + 6*(*idx++);
584       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
585       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
586       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
587       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
588       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
589       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
590       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
591       v    += 36;
592     }
593     if (usecprow) z = zarray + 6*ridx[i];
594     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
595     if (!usecprow) z += 6;
596   }
597 
598   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
600   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
601   PetscFunctionReturn(0);
602 }
603 
604 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
605 {
606   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
607   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
608   const PetscScalar *x,*xb;
609   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
610   const MatScalar   *v;
611   PetscErrorCode    ierr;
612   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
613   PetscBool         usecprow=a->compressedrow.use;
614 
615   PetscFunctionBegin;
616   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
617   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
618 
619   idx = a->j;
620   v   = a->a;
621   if (usecprow) {
622     mbs  = a->compressedrow.nrows;
623     ii   = a->compressedrow.i;
624     ridx = a->compressedrow.rindex;
625     ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr);
626   } else {
627     mbs = a->mbs;
628     ii  = a->i;
629     z   = zarray;
630   }
631 
632   for (i=0; i<mbs; i++) {
633     n  = ii[1] - ii[0];
634     ii++;
635     sum1 = 0.0;
636     sum2 = 0.0;
637     sum3 = 0.0;
638     sum4 = 0.0;
639     sum5 = 0.0;
640     sum6 = 0.0;
641     sum7 = 0.0;
642 
643     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
644     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
645     for (j=0; j<n; j++) {
646       xb    = x + 7*(*idx++);
647       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
648       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
649       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
650       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
651       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
652       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
653       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
654       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
655       v    += 49;
656     }
657     if (usecprow) z = zarray + 7*ridx[i];
658     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
659     if (!usecprow) z += 7;
660   }
661 
662   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
663   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
664   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
665   PetscFunctionReturn(0);
666 }
667 
668 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
669 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
670 {
671   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
672   PetscScalar    *z = 0,*work,*workt,*zarray;
673   const PetscScalar *x,*xb;
674   const MatScalar   *v;
675   PetscErrorCode ierr;
676   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
677   const PetscInt *idx,*ii,*ridx=NULL;
678   PetscInt       k;
679   PetscBool      usecprow=a->compressedrow.use;
680 
681   __m256d a0,a1,a2,a3,a4,a5;
682   __m256d w0,w1,w2,w3;
683   __m256d z0,z1,z2;
684   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
685 
686   PetscFunctionBegin;
687   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
688   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
689 
690   idx = a->j;
691   v   = a->a;
692   if (usecprow) {
693     mbs  = a->compressedrow.nrows;
694     ii   = a->compressedrow.i;
695     ridx = a->compressedrow.rindex;
696     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
697   } else {
698     mbs = a->mbs;
699     ii  = a->i;
700     z   = zarray;
701   }
702 
703   if (!a->mult_work) {
704     k    = PetscMax(A->rmap->n,A->cmap->n);
705     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
706   }
707 
708   work = a->mult_work;
709   for (i=0; i<mbs; i++) {
710     n           = ii[1] - ii[0]; ii++;
711     workt       = work;
712     for (j=0; j<n; j++) {
713       xb = x + bs*(*idx++);
714       for (k=0; k<bs; k++) workt[k] = xb[k];
715       workt += bs;
716     }
717     if (usecprow) z = zarray + bs*ridx[i];
718 
719     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
720 
721     for (j=0; j<n; j++) {
722       /* first column of a */
723       w0 = _mm256_set1_pd(work[j*9  ]);
724       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
725       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
726       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
727 
728       /* second column of a */
729       w1 = _mm256_set1_pd(work[j*9+ 1]);
730       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
731       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
732       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
733 
734       /* third column of a */
735       w2 = _mm256_set1_pd(work[j*9 +2]);
736       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
737       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
738       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
739 
740       /* fourth column of a */
741       w3 = _mm256_set1_pd(work[j*9+ 3]);
742       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
743       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
744       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
745 
746       /* fifth column of a */
747       w0 = _mm256_set1_pd(work[j*9+ 4]);
748       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
749       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
750       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
751 
752       /* sixth column of a */
753       w1 = _mm256_set1_pd(work[j*9+ 5]);
754       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
755       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
756       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
757 
758       /* seventh column of a */
759       w2 = _mm256_set1_pd(work[j*9+ 6]);
760       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
761       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
762       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
763 
764       /* eigth column of a */
765       w3 = _mm256_set1_pd(work[j*9+ 7]);
766       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
767       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
768       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
769 
770       /* ninth column of a */
771       w0 = _mm256_set1_pd(work[j*9+ 8]);
772       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
773       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
774       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
775     }
776 
777     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
778 
779     v += n*bs2;
780     if (!usecprow) z += bs;
781   }
782   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
783   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
784   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
785   PetscFunctionReturn(0);
786 }
787 #endif
788 
789 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
790 {
791   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
792   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
793   const PetscScalar *x,*xb;
794   PetscScalar       *zarray,xv;
795   const MatScalar   *v;
796   PetscErrorCode    ierr;
797   const PetscInt    *ii,*ij=a->j,*idx;
798   PetscInt          mbs,i,j,k,n,*ridx=NULL;
799   PetscBool         usecprow=a->compressedrow.use;
800 
801   PetscFunctionBegin;
802   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
803   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
804 
805   v = a->a;
806   if (usecprow) {
807     mbs  = a->compressedrow.nrows;
808     ii   = a->compressedrow.i;
809     ridx = a->compressedrow.rindex;
810     ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr);
811   } else {
812     mbs = a->mbs;
813     ii  = a->i;
814     z   = zarray;
815   }
816 
817   for (i=0; i<mbs; i++) {
818     n    = ii[i+1] - ii[i];
819     idx  = ij + ii[i];
820     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
821     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
822 
823     for (j=0; j<n; j++) {
824       xb = x + 11*(idx[j]);
825 
826       for (k=0; k<11; k++) {
827         xv     =  xb[k];
828         sum1  += v[0]*xv;
829         sum2  += v[1]*xv;
830         sum3  += v[2]*xv;
831         sum4  += v[3]*xv;
832         sum5  += v[4]*xv;
833         sum6  += v[5]*xv;
834         sum7  += v[6]*xv;
835         sum8  += v[7]*xv;
836         sum9  += v[8]*xv;
837         sum10 += v[9]*xv;
838         sum11 += v[10]*xv;
839         v     += 11;
840       }
841     }
842     if (usecprow) z = zarray + 11*ridx[i];
843     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
844     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
845 
846     if (!usecprow) z += 11;
847   }
848 
849   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
850   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
851   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
852   PetscFunctionReturn(0);
853 }
854 
855 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
856 /* Default MatMult for block size 15 */
857 
858 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
859 {
860   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
861   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
862   const PetscScalar *x,*xb;
863   PetscScalar       *zarray,xv;
864   const MatScalar   *v;
865   PetscErrorCode    ierr;
866   const PetscInt    *ii,*ij=a->j,*idx;
867   PetscInt          mbs,i,j,k,n,*ridx=NULL;
868   PetscBool         usecprow=a->compressedrow.use;
869 
870   PetscFunctionBegin;
871   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
872   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
873 
874   v = a->a;
875   if (usecprow) {
876     mbs  = a->compressedrow.nrows;
877     ii   = a->compressedrow.i;
878     ridx = a->compressedrow.rindex;
879     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
880   } else {
881     mbs = a->mbs;
882     ii  = a->i;
883     z   = zarray;
884   }
885 
886   for (i=0; i<mbs; i++) {
887     n    = ii[i+1] - ii[i];
888     idx  = ij + ii[i];
889     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
890     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
891 
892     for (j=0; j<n; j++) {
893       xb = x + 15*(idx[j]);
894 
895       for (k=0; k<15; k++) {
896         xv     =  xb[k];
897         sum1  += v[0]*xv;
898         sum2  += v[1]*xv;
899         sum3  += v[2]*xv;
900         sum4  += v[3]*xv;
901         sum5  += v[4]*xv;
902         sum6  += v[5]*xv;
903         sum7  += v[6]*xv;
904         sum8  += v[7]*xv;
905         sum9  += v[8]*xv;
906         sum10 += v[9]*xv;
907         sum11 += v[10]*xv;
908         sum12 += v[11]*xv;
909         sum13 += v[12]*xv;
910         sum14 += v[13]*xv;
911         sum15 += v[14]*xv;
912         v     += 15;
913       }
914     }
915     if (usecprow) z = zarray + 15*ridx[i];
916     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
917     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
918 
919     if (!usecprow) z += 15;
920   }
921 
922   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
923   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
924   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
925   PetscFunctionReturn(0);
926 }
927 
928 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
929 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
930 {
931   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
932   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
933   const PetscScalar *x,*xb;
934   PetscScalar       x1,x2,x3,x4,*zarray;
935   const MatScalar   *v;
936   PetscErrorCode    ierr;
937   const PetscInt    *ii,*ij=a->j,*idx;
938   PetscInt          mbs,i,j,n,*ridx=NULL;
939   PetscBool         usecprow=a->compressedrow.use;
940 
941   PetscFunctionBegin;
942   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
943   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
944 
945   v = a->a;
946   if (usecprow) {
947     mbs  = a->compressedrow.nrows;
948     ii   = a->compressedrow.i;
949     ridx = a->compressedrow.rindex;
950     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
951   } else {
952     mbs = a->mbs;
953     ii  = a->i;
954     z   = zarray;
955   }
956 
957   for (i=0; i<mbs; i++) {
958     n    = ii[i+1] - ii[i];
959     idx  = ij + ii[i];
960     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
961     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
962 
963     for (j=0; j<n; j++) {
964       xb = x + 15*(idx[j]);
965       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
966 
967       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
968       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
969       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
970       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
971       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
972       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
973       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
974       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
975       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
976       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
977       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
978       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
979       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
980       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
981       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
982 
983       v += 60;
984 
985       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
986 
987       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
988       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
989       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
990       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
991       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
992       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
993       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
994       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
995       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
996       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
997       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
998       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
999       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1000       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1001       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1002       v     += 60;
1003 
1004       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1005       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1006       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1007       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1008       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1009       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1010       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1011       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1012       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1013       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1014       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1015       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1016       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1017       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1018       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1019       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1020       v     += 60;
1021 
1022       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1023       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1024       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1025       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1026       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1027       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1028       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1029       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1030       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1031       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1032       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1033       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1034       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1035       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1036       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1037       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1038       v     += 45;
1039     }
1040     if (usecprow) z = zarray + 15*ridx[i];
1041     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1042     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1043 
1044     if (!usecprow) z += 15;
1045   }
1046 
1047   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1048   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1049   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1050   PetscFunctionReturn(0);
1051 }
1052 
1053 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1054 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1055 {
1056   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1057   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1058   const PetscScalar *x,*xb;
1059   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1060   const MatScalar   *v;
1061   PetscErrorCode    ierr;
1062   const PetscInt    *ii,*ij=a->j,*idx;
1063   PetscInt          mbs,i,j,n,*ridx=NULL;
1064   PetscBool         usecprow=a->compressedrow.use;
1065 
1066   PetscFunctionBegin;
1067   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1068   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1069 
1070   v = a->a;
1071   if (usecprow) {
1072     mbs  = a->compressedrow.nrows;
1073     ii   = a->compressedrow.i;
1074     ridx = a->compressedrow.rindex;
1075     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1076   } else {
1077     mbs = a->mbs;
1078     ii  = a->i;
1079     z   = zarray;
1080   }
1081 
1082   for (i=0; i<mbs; i++) {
1083     n    = ii[i+1] - ii[i];
1084     idx  = ij + ii[i];
1085     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1086     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1087 
1088     for (j=0; j<n; j++) {
1089       xb = x + 15*(idx[j]);
1090       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1091       x8 = xb[7];
1092 
1093       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1094       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1095       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1096       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1097       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1098       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1099       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1100       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1101       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1102       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1103       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1104       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1105       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1106       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1107       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1108       v     += 120;
1109 
1110       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1111 
1112       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1113       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1114       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1115       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1116       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1117       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1118       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1119       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1120       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1121       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1122       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1123       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1124       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1125       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1126       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1127       v     += 105;
1128     }
1129     if (usecprow) z = zarray + 15*ridx[i];
1130     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1131     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1132 
1133     if (!usecprow) z += 15;
1134   }
1135 
1136   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1137   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1138   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1139   PetscFunctionReturn(0);
1140 }
1141 
1142 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1143 
1144 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1145 {
1146   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1147   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1148   const PetscScalar *x,*xb;
1149   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1150   const MatScalar   *v;
1151   PetscErrorCode    ierr;
1152   const PetscInt    *ii,*ij=a->j,*idx;
1153   PetscInt          mbs,i,j,n,*ridx=NULL;
1154   PetscBool         usecprow=a->compressedrow.use;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1158   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1159 
1160   v = a->a;
1161   if (usecprow) {
1162     mbs  = a->compressedrow.nrows;
1163     ii   = a->compressedrow.i;
1164     ridx = a->compressedrow.rindex;
1165     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1166   } else {
1167     mbs = a->mbs;
1168     ii  = a->i;
1169     z   = zarray;
1170   }
1171 
1172   for (i=0; i<mbs; i++) {
1173     n    = ii[i+1] - ii[i];
1174     idx  = ij + ii[i];
1175     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1176     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1177 
1178     for (j=0; j<n; j++) {
1179       xb = x + 15*(idx[j]);
1180       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1181       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1182 
1183       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1184       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1185       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1186       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1187       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1188       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1189       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1190       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1191       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1192       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1193       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1194       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1195       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1196       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1197       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1198       v     += 225;
1199     }
1200     if (usecprow) z = zarray + 15*ridx[i];
1201     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1202     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1203 
1204     if (!usecprow) z += 15;
1205   }
1206 
1207   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1208   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1209   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1210   PetscFunctionReturn(0);
1211 }
1212 
1213 
1214 /*
1215     This will not work with MatScalar == float because it calls the BLAS
1216 */
1217 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1218 {
1219   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1220   PetscScalar       *z = 0,*work,*workt,*zarray;
1221   const PetscScalar *x,*xb;
1222   const MatScalar   *v;
1223   PetscErrorCode    ierr;
1224   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1225   const PetscInt    *idx,*ii,*ridx=NULL;
1226   PetscInt          ncols,k;
1227   PetscBool         usecprow=a->compressedrow.use;
1228 
1229   PetscFunctionBegin;
1230   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1231   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1232 
1233   idx = a->j;
1234   v   = a->a;
1235   if (usecprow) {
1236     mbs  = a->compressedrow.nrows;
1237     ii   = a->compressedrow.i;
1238     ridx = a->compressedrow.rindex;
1239     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1240   } else {
1241     mbs = a->mbs;
1242     ii  = a->i;
1243     z   = zarray;
1244   }
1245 
1246   if (!a->mult_work) {
1247     k    = PetscMax(A->rmap->n,A->cmap->n);
1248     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1249   }
1250   work = a->mult_work;
1251   for (i=0; i<mbs; i++) {
1252     n           = ii[1] - ii[0]; ii++;
1253     ncols       = n*bs;
1254     workt       = work;
1255     for (j=0; j<n; j++) {
1256       xb = x + bs*(*idx++);
1257       for (k=0; k<bs; k++) workt[k] = xb[k];
1258       workt += bs;
1259     }
1260     if (usecprow) z = zarray + bs*ridx[i];
1261     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1262     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1263     v += n*bs2;
1264     if (!usecprow) z += bs;
1265   }
1266   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1267   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1268   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1269   PetscFunctionReturn(0);
1270 }
1271 
1272 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1273 {
1274   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1275   const PetscScalar *x;
1276   PetscScalar       *y,*z,sum;
1277   const MatScalar   *v;
1278   PetscErrorCode    ierr;
1279   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1280   const PetscInt    *idx,*ii;
1281   PetscBool         usecprow=a->compressedrow.use;
1282 
1283   PetscFunctionBegin;
1284   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1285   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1286 
1287   idx = a->j;
1288   v   = a->a;
1289   if (usecprow) {
1290     if (zz != yy) {
1291       ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr);
1292     }
1293     mbs  = a->compressedrow.nrows;
1294     ii   = a->compressedrow.i;
1295     ridx = a->compressedrow.rindex;
1296   } else {
1297     ii = a->i;
1298   }
1299 
1300   for (i=0; i<mbs; i++) {
1301     n = ii[1] - ii[0];
1302     ii++;
1303     if (!usecprow) {
1304       sum         = y[i];
1305     } else {
1306       sum = y[ridx[i]];
1307     }
1308     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1309     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1310     PetscSparseDensePlusDot(sum,x,v,idx,n);
1311     v   += n;
1312     idx += n;
1313     if (usecprow) {
1314       z[ridx[i]] = sum;
1315     } else {
1316       z[i] = sum;
1317     }
1318   }
1319   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1320   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1321   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1326 {
1327   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1328   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1329   const PetscScalar *x,*xb;
1330   PetscScalar       x1,x2,*yarray,*zarray;
1331   const MatScalar   *v;
1332   PetscErrorCode    ierr;
1333   PetscInt          mbs = a->mbs,i,n,j;
1334   const PetscInt    *idx,*ii,*ridx = NULL;
1335   PetscBool         usecprow = a->compressedrow.use;
1336 
1337   PetscFunctionBegin;
1338   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1339   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1340 
1341   idx = a->j;
1342   v   = a->a;
1343   if (usecprow) {
1344     if (zz != yy) {
1345       ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr);
1346     }
1347     mbs  = a->compressedrow.nrows;
1348     ii   = a->compressedrow.i;
1349     ridx = a->compressedrow.rindex;
1350   } else {
1351     ii = a->i;
1352     y  = yarray;
1353     z  = zarray;
1354   }
1355 
1356   for (i=0; i<mbs; i++) {
1357     n = ii[1] - ii[0]; ii++;
1358     if (usecprow) {
1359       z = zarray + 2*ridx[i];
1360       y = yarray + 2*ridx[i];
1361     }
1362     sum1 = y[0]; sum2 = y[1];
1363     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1364     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1365     for (j=0; j<n; j++) {
1366       xb = x + 2*(*idx++);
1367       x1 = xb[0];
1368       x2 = xb[1];
1369 
1370       sum1 += v[0]*x1 + v[2]*x2;
1371       sum2 += v[1]*x1 + v[3]*x2;
1372       v    += 4;
1373     }
1374     z[0] = sum1; z[1] = sum2;
1375     if (!usecprow) {
1376       z += 2; y += 2;
1377     }
1378   }
1379   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1380   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1381   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1382   PetscFunctionReturn(0);
1383 }
1384 
1385 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1386 {
1387   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1388   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1389   const PetscScalar *x,*xb;
1390   const MatScalar   *v;
1391   PetscErrorCode    ierr;
1392   PetscInt          mbs = a->mbs,i,j,n;
1393   const PetscInt    *idx,*ii,*ridx = NULL;
1394   PetscBool         usecprow = a->compressedrow.use;
1395 
1396   PetscFunctionBegin;
1397   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1398   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1399 
1400   idx = a->j;
1401   v   = a->a;
1402   if (usecprow) {
1403     if (zz != yy) {
1404       ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr);
1405     }
1406     mbs  = a->compressedrow.nrows;
1407     ii   = a->compressedrow.i;
1408     ridx = a->compressedrow.rindex;
1409   } else {
1410     ii = a->i;
1411     y  = yarray;
1412     z  = zarray;
1413   }
1414 
1415   for (i=0; i<mbs; i++) {
1416     n = ii[1] - ii[0]; ii++;
1417     if (usecprow) {
1418       z = zarray + 3*ridx[i];
1419       y = yarray + 3*ridx[i];
1420     }
1421     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1422     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1423     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1424     for (j=0; j<n; j++) {
1425       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1426       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1427       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1428       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1429       v    += 9;
1430     }
1431     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1432     if (!usecprow) {
1433       z += 3; y += 3;
1434     }
1435   }
1436   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1437   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1438   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1439   PetscFunctionReturn(0);
1440 }
1441 
1442 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1443 {
1444   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1445   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1446   const PetscScalar *x,*xb;
1447   const MatScalar   *v;
1448   PetscErrorCode    ierr;
1449   PetscInt          mbs = a->mbs,i,j,n;
1450   const PetscInt    *idx,*ii,*ridx=NULL;
1451   PetscBool         usecprow=a->compressedrow.use;
1452 
1453   PetscFunctionBegin;
1454   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1455   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1456 
1457   idx = a->j;
1458   v   = a->a;
1459   if (usecprow) {
1460     if (zz != yy) {
1461       ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr);
1462     }
1463     mbs  = a->compressedrow.nrows;
1464     ii   = a->compressedrow.i;
1465     ridx = a->compressedrow.rindex;
1466   } else {
1467     ii = a->i;
1468     y  = yarray;
1469     z  = zarray;
1470   }
1471 
1472   for (i=0; i<mbs; i++) {
1473     n = ii[1] - ii[0]; ii++;
1474     if (usecprow) {
1475       z = zarray + 4*ridx[i];
1476       y = yarray + 4*ridx[i];
1477     }
1478     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1479     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1480     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1481     for (j=0; j<n; j++) {
1482       xb    = x + 4*(*idx++);
1483       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1484       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1485       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1486       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1487       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1488       v    += 16;
1489     }
1490     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1491     if (!usecprow) {
1492       z += 4; y += 4;
1493     }
1494   }
1495   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1496   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1497   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1498   PetscFunctionReturn(0);
1499 }
1500 
1501 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1502 {
1503   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1504   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1505   const PetscScalar *x,*xb;
1506   PetscScalar       *yarray,*zarray;
1507   const MatScalar   *v;
1508   PetscErrorCode    ierr;
1509   PetscInt          mbs = a->mbs,i,j,n;
1510   const PetscInt    *idx,*ii,*ridx = NULL;
1511   PetscBool         usecprow=a->compressedrow.use;
1512 
1513   PetscFunctionBegin;
1514   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1515   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1516 
1517   idx = a->j;
1518   v   = a->a;
1519   if (usecprow) {
1520     if (zz != yy) {
1521       ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr);
1522     }
1523     mbs  = a->compressedrow.nrows;
1524     ii   = a->compressedrow.i;
1525     ridx = a->compressedrow.rindex;
1526   } else {
1527     ii = a->i;
1528     y  = yarray;
1529     z  = zarray;
1530   }
1531 
1532   for (i=0; i<mbs; i++) {
1533     n = ii[1] - ii[0]; ii++;
1534     if (usecprow) {
1535       z = zarray + 5*ridx[i];
1536       y = yarray + 5*ridx[i];
1537     }
1538     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1539     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1540     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1541     for (j=0; j<n; j++) {
1542       xb    = x + 5*(*idx++);
1543       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1544       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1545       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1546       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1547       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1548       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1549       v    += 25;
1550     }
1551     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1552     if (!usecprow) {
1553       z += 5; y += 5;
1554     }
1555   }
1556   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1557   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1558   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1559   PetscFunctionReturn(0);
1560 }
1561 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1562 {
1563   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1564   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1565   const PetscScalar *x,*xb;
1566   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1567   const MatScalar   *v;
1568   PetscErrorCode    ierr;
1569   PetscInt          mbs = a->mbs,i,j,n;
1570   const PetscInt    *idx,*ii,*ridx=NULL;
1571   PetscBool         usecprow=a->compressedrow.use;
1572 
1573   PetscFunctionBegin;
1574   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1575   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1576 
1577   idx = a->j;
1578   v   = a->a;
1579   if (usecprow) {
1580     if (zz != yy) {
1581       ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr);
1582     }
1583     mbs  = a->compressedrow.nrows;
1584     ii   = a->compressedrow.i;
1585     ridx = a->compressedrow.rindex;
1586   } else {
1587     ii = a->i;
1588     y  = yarray;
1589     z  = zarray;
1590   }
1591 
1592   for (i=0; i<mbs; i++) {
1593     n = ii[1] - ii[0]; ii++;
1594     if (usecprow) {
1595       z = zarray + 6*ridx[i];
1596       y = yarray + 6*ridx[i];
1597     }
1598     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1599     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1600     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1601     for (j=0; j<n; j++) {
1602       xb    = x + 6*(*idx++);
1603       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1604       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1605       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1606       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1607       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1608       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1609       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1610       v    += 36;
1611     }
1612     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1613     if (!usecprow) {
1614       z += 6; y += 6;
1615     }
1616   }
1617   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1618   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1619   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1620   PetscFunctionReturn(0);
1621 }
1622 
1623 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1624 {
1625   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1626   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1627   const PetscScalar *x,*xb;
1628   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1629   const MatScalar   *v;
1630   PetscErrorCode    ierr;
1631   PetscInt          mbs = a->mbs,i,j,n;
1632   const PetscInt    *idx,*ii,*ridx = NULL;
1633   PetscBool         usecprow=a->compressedrow.use;
1634 
1635   PetscFunctionBegin;
1636   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1637   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1638 
1639   idx = a->j;
1640   v   = a->a;
1641   if (usecprow) {
1642     if (zz != yy) {
1643       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1644     }
1645     mbs  = a->compressedrow.nrows;
1646     ii   = a->compressedrow.i;
1647     ridx = a->compressedrow.rindex;
1648   } else {
1649     ii = a->i;
1650     y  = yarray;
1651     z  = zarray;
1652   }
1653 
1654   for (i=0; i<mbs; i++) {
1655     n = ii[1] - ii[0]; ii++;
1656     if (usecprow) {
1657       z = zarray + 7*ridx[i];
1658       y = yarray + 7*ridx[i];
1659     }
1660     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1661     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1662     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1663     for (j=0; j<n; j++) {
1664       xb    = x + 7*(*idx++);
1665       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1666       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1667       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1668       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1669       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1670       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1671       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1672       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1673       v    += 49;
1674     }
1675     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1676     if (!usecprow) {
1677       z += 7; y += 7;
1678     }
1679   }
1680   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1681   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1682   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1683   PetscFunctionReturn(0);
1684 }
1685 
1686 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1687 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1688 {
1689   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1690   PetscScalar    *z = 0,*work,*workt,*zarray;
1691   const PetscScalar *x,*xb;
1692   const MatScalar   *v;
1693   PetscErrorCode ierr;
1694   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1695   PetscInt       k;
1696   PetscBool      usecprow=a->compressedrow.use;
1697   const PetscInt *idx,*ii,*ridx=NULL;
1698 
1699   __m256d a0,a1,a2,a3,a4,a5;
1700   __m256d w0,w1,w2,w3;
1701   __m256d z0,z1,z2;
1702   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1703 
1704   PetscFunctionBegin;
1705   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1706   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1707   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1708 
1709   idx = a->j;
1710   v   = a->a;
1711   if (usecprow) {
1712     mbs  = a->compressedrow.nrows;
1713     ii   = a->compressedrow.i;
1714     ridx = a->compressedrow.rindex;
1715   } else {
1716     mbs = a->mbs;
1717     ii  = a->i;
1718     z   = zarray;
1719   }
1720 
1721   if (!a->mult_work) {
1722     k    = PetscMax(A->rmap->n,A->cmap->n);
1723     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1724   }
1725 
1726   work = a->mult_work;
1727   for (i=0; i<mbs; i++) {
1728     n           = ii[1] - ii[0]; ii++;
1729     workt       = work;
1730     for (j=0; j<n; j++) {
1731       xb = x + bs*(*idx++);
1732       for (k=0; k<bs; k++) workt[k] = xb[k];
1733       workt += bs;
1734     }
1735     if (usecprow) z = zarray + bs*ridx[i];
1736 
1737     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1738 
1739     for (j=0; j<n; j++) {
1740       /* first column of a */
1741       w0 = _mm256_set1_pd(work[j*9  ]);
1742       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1743       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1744       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1745 
1746       /* second column of a */
1747       w1 = _mm256_set1_pd(work[j*9+ 1]);
1748       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1749       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1750       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1751 
1752       /* third column of a */
1753       w2 = _mm256_set1_pd(work[j*9+ 2]);
1754       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1755       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1756       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1757 
1758       /* fourth column of a */
1759       w3 = _mm256_set1_pd(work[j*9+ 3]);
1760       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1761       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1762       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1763 
1764       /* fifth column of a */
1765       w0 = _mm256_set1_pd(work[j*9+ 4]);
1766       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1767       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1768       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1769 
1770       /* sixth column of a */
1771       w1 = _mm256_set1_pd(work[j*9+ 5]);
1772       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1773       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1774       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1775 
1776       /* seventh column of a */
1777       w2 = _mm256_set1_pd(work[j*9+ 6]);
1778       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1779       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1780       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1781 
1782       /* eigth column of a */
1783       w3 = _mm256_set1_pd(work[j*9+ 7]);
1784       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1785       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1786       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1787 
1788       /* ninth column of a */
1789       w0 = _mm256_set1_pd(work[j*9+ 8]);
1790       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1791       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1792       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1793     }
1794 
1795     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1796 
1797     v += n*bs2;
1798     if (!usecprow) z += bs;
1799   }
1800   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1801   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1802   ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
1803   PetscFunctionReturn(0);
1804 }
1805 #endif
1806 
1807 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1808 {
1809   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1810   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1811   const PetscScalar *x,*xb;
1812   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1813   const MatScalar   *v;
1814   PetscErrorCode    ierr;
1815   PetscInt          mbs = a->mbs,i,j,n;
1816   const PetscInt    *idx,*ii,*ridx = NULL;
1817   PetscBool         usecprow=a->compressedrow.use;
1818 
1819   PetscFunctionBegin;
1820   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1821   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1822 
1823   idx = a->j;
1824   v   = a->a;
1825   if (usecprow) {
1826     if (zz != yy) {
1827       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1828     }
1829     mbs  = a->compressedrow.nrows;
1830     ii   = a->compressedrow.i;
1831     ridx = a->compressedrow.rindex;
1832   } else {
1833     ii = a->i;
1834     y  = yarray;
1835     z  = zarray;
1836   }
1837 
1838   for (i=0; i<mbs; i++) {
1839     n = ii[1] - ii[0]; ii++;
1840     if (usecprow) {
1841       z = zarray + 11*ridx[i];
1842       y = yarray + 11*ridx[i];
1843     }
1844     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1845     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1846     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1847     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1848     for (j=0; j<n; j++) {
1849       xb    = x + 11*(*idx++);
1850       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1851       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1852       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1853       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1854       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1855       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1856       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1857       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1858       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1859       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1860       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1861       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1862       v    += 121;
1863     }
1864     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1865     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1866     if (!usecprow) {
1867       z += 11; y += 11;
1868     }
1869   }
1870   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1871   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1872   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
1873   PetscFunctionReturn(0);
1874 }
1875 
1876 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1877 {
1878   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1879   PetscScalar       *z = 0,*work,*workt,*zarray;
1880   const PetscScalar *x,*xb;
1881   const MatScalar   *v;
1882   PetscErrorCode    ierr;
1883   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1884   PetscInt          ncols,k;
1885   const PetscInt    *ridx = NULL,*idx,*ii;
1886   PetscBool         usecprow = a->compressedrow.use;
1887 
1888   PetscFunctionBegin;
1889   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1890   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1891   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1892 
1893   idx = a->j;
1894   v   = a->a;
1895   if (usecprow) {
1896     mbs  = a->compressedrow.nrows;
1897     ii   = a->compressedrow.i;
1898     ridx = a->compressedrow.rindex;
1899   } else {
1900     mbs = a->mbs;
1901     ii  = a->i;
1902     z   = zarray;
1903   }
1904 
1905   if (!a->mult_work) {
1906     k    = PetscMax(A->rmap->n,A->cmap->n);
1907     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1908   }
1909   work = a->mult_work;
1910   for (i=0; i<mbs; i++) {
1911     n     = ii[1] - ii[0]; ii++;
1912     ncols = n*bs;
1913     workt = work;
1914     for (j=0; j<n; j++) {
1915       xb = x + bs*(*idx++);
1916       for (k=0; k<bs; k++) workt[k] = xb[k];
1917       workt += bs;
1918     }
1919     if (usecprow) z = zarray + bs*ridx[i];
1920     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1921     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1922     v += n*bs2;
1923     if (!usecprow) z += bs;
1924   }
1925   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1926   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1927   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1928   PetscFunctionReturn(0);
1929 }
1930 
1931 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1932 {
1933   PetscScalar    zero = 0.0;
1934   PetscErrorCode ierr;
1935 
1936   PetscFunctionBegin;
1937   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1938   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1939   PetscFunctionReturn(0);
1940 }
1941 
1942 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1943 {
1944   PetscScalar    zero = 0.0;
1945   PetscErrorCode ierr;
1946 
1947   PetscFunctionBegin;
1948   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1949   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1950   PetscFunctionReturn(0);
1951 }
1952 
1953 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1954 {
1955   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1956   PetscScalar       *z,x1,x2,x3,x4,x5;
1957   const PetscScalar *x,*xb = NULL;
1958   const MatScalar   *v;
1959   PetscErrorCode    ierr;
1960   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1961   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1962   Mat_CompressedRow cprow = a->compressedrow;
1963   PetscBool         usecprow = cprow.use;
1964 
1965   PetscFunctionBegin;
1966   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1967   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1968   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1969 
1970   idx = a->j;
1971   v   = a->a;
1972   if (usecprow) {
1973     mbs  = cprow.nrows;
1974     ii   = cprow.i;
1975     ridx = cprow.rindex;
1976   } else {
1977     mbs=a->mbs;
1978     ii = a->i;
1979     xb = x;
1980   }
1981 
1982   switch (bs) {
1983   case 1:
1984     for (i=0; i<mbs; i++) {
1985       if (usecprow) xb = x + ridx[i];
1986       x1 = xb[0];
1987       ib = idx + ii[0];
1988       n  = ii[1] - ii[0]; ii++;
1989       for (j=0; j<n; j++) {
1990         rval     = ib[j];
1991         z[rval] += PetscConj(*v) * x1;
1992         v++;
1993       }
1994       if (!usecprow) xb++;
1995     }
1996     break;
1997   case 2:
1998     for (i=0; i<mbs; i++) {
1999       if (usecprow) xb = x + 2*ridx[i];
2000       x1 = xb[0]; x2 = xb[1];
2001       ib = idx + ii[0];
2002       n  = ii[1] - ii[0]; ii++;
2003       for (j=0; j<n; j++) {
2004         rval       = ib[j]*2;
2005         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2006         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2007         v         += 4;
2008       }
2009       if (!usecprow) xb += 2;
2010     }
2011     break;
2012   case 3:
2013     for (i=0; i<mbs; i++) {
2014       if (usecprow) xb = x + 3*ridx[i];
2015       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2016       ib = idx + ii[0];
2017       n  = ii[1] - ii[0]; ii++;
2018       for (j=0; j<n; j++) {
2019         rval       = ib[j]*3;
2020         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2021         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2022         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2023         v         += 9;
2024       }
2025       if (!usecprow) xb += 3;
2026     }
2027     break;
2028   case 4:
2029     for (i=0; i<mbs; i++) {
2030       if (usecprow) xb = x + 4*ridx[i];
2031       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2032       ib = idx + ii[0];
2033       n  = ii[1] - ii[0]; ii++;
2034       for (j=0; j<n; j++) {
2035         rval       = ib[j]*4;
2036         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2037         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2038         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2039         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2040         v         += 16;
2041       }
2042       if (!usecprow) xb += 4;
2043     }
2044     break;
2045   case 5:
2046     for (i=0; i<mbs; i++) {
2047       if (usecprow) xb = x + 5*ridx[i];
2048       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2049       x4 = xb[3]; x5 = xb[4];
2050       ib = idx + ii[0];
2051       n  = ii[1] - ii[0]; ii++;
2052       for (j=0; j<n; j++) {
2053         rval       = ib[j]*5;
2054         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2055         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2056         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2057         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2058         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2059         v         += 25;
2060       }
2061       if (!usecprow) xb += 5;
2062     }
2063     break;
2064   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2065     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2066 #if 0
2067     {
2068       PetscInt          ncols,k,bs2=a->bs2;
2069       PetscScalar       *work,*workt,zb;
2070       const PetscScalar *xtmp;
2071       if (!a->mult_work) {
2072         k    = PetscMax(A->rmap->n,A->cmap->n);
2073         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2074       }
2075       work = a->mult_work;
2076       xtmp = x;
2077       for (i=0; i<mbs; i++) {
2078         n     = ii[1] - ii[0]; ii++;
2079         ncols = n*bs;
2080         ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2081         if (usecprow) xtmp = x + bs*ridx[i];
2082         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2083         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2084         v += n*bs2;
2085         if (!usecprow) xtmp += bs;
2086         workt = work;
2087         for (j=0; j<n; j++) {
2088           zb = z + bs*(*idx++);
2089           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2090           workt += bs;
2091         }
2092       }
2093     }
2094 #endif
2095   }
2096   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2097   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2098   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2099   PetscFunctionReturn(0);
2100 }
2101 
2102 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2103 {
2104   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2105   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2106   const PetscScalar *x,*xb = 0;
2107   const MatScalar   *v;
2108   PetscErrorCode    ierr;
2109   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2110   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2111   Mat_CompressedRow cprow   = a->compressedrow;
2112   PetscBool         usecprow=cprow.use;
2113 
2114   PetscFunctionBegin;
2115   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2116   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2117   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2118 
2119   idx = a->j;
2120   v   = a->a;
2121   if (usecprow) {
2122     mbs  = cprow.nrows;
2123     ii   = cprow.i;
2124     ridx = cprow.rindex;
2125   } else {
2126     mbs=a->mbs;
2127     ii = a->i;
2128     xb = x;
2129   }
2130 
2131   switch (bs) {
2132   case 1:
2133     for (i=0; i<mbs; i++) {
2134       if (usecprow) xb = x + ridx[i];
2135       x1 = xb[0];
2136       ib = idx + ii[0];
2137       n  = ii[1] - ii[0]; ii++;
2138       for (j=0; j<n; j++) {
2139         rval     = ib[j];
2140         z[rval] += *v * x1;
2141         v++;
2142       }
2143       if (!usecprow) xb++;
2144     }
2145     break;
2146   case 2:
2147     for (i=0; i<mbs; i++) {
2148       if (usecprow) xb = x + 2*ridx[i];
2149       x1 = xb[0]; x2 = xb[1];
2150       ib = idx + ii[0];
2151       n  = ii[1] - ii[0]; ii++;
2152       for (j=0; j<n; j++) {
2153         rval       = ib[j]*2;
2154         z[rval++] += v[0]*x1 + v[1]*x2;
2155         z[rval++] += v[2]*x1 + v[3]*x2;
2156         v         += 4;
2157       }
2158       if (!usecprow) xb += 2;
2159     }
2160     break;
2161   case 3:
2162     for (i=0; i<mbs; i++) {
2163       if (usecprow) xb = x + 3*ridx[i];
2164       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2165       ib = idx + ii[0];
2166       n  = ii[1] - ii[0]; ii++;
2167       for (j=0; j<n; j++) {
2168         rval       = ib[j]*3;
2169         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2170         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2171         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2172         v         += 9;
2173       }
2174       if (!usecprow) xb += 3;
2175     }
2176     break;
2177   case 4:
2178     for (i=0; i<mbs; i++) {
2179       if (usecprow) xb = x + 4*ridx[i];
2180       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2181       ib = idx + ii[0];
2182       n  = ii[1] - ii[0]; ii++;
2183       for (j=0; j<n; j++) {
2184         rval       = ib[j]*4;
2185         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2186         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2187         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2188         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2189         v         += 16;
2190       }
2191       if (!usecprow) xb += 4;
2192     }
2193     break;
2194   case 5:
2195     for (i=0; i<mbs; i++) {
2196       if (usecprow) xb = x + 5*ridx[i];
2197       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2198       x4 = xb[3]; x5 = xb[4];
2199       ib = idx + ii[0];
2200       n  = ii[1] - ii[0]; ii++;
2201       for (j=0; j<n; j++) {
2202         rval       = ib[j]*5;
2203         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2204         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2205         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2206         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2207         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2208         v         += 25;
2209       }
2210       if (!usecprow) xb += 5;
2211     }
2212     break;
2213   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2214     PetscInt          ncols,k;
2215     PetscScalar       *work,*workt;
2216     const PetscScalar *xtmp;
2217     if (!a->mult_work) {
2218       k    = PetscMax(A->rmap->n,A->cmap->n);
2219       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2220     }
2221     work = a->mult_work;
2222     xtmp = x;
2223     for (i=0; i<mbs; i++) {
2224       n     = ii[1] - ii[0]; ii++;
2225       ncols = n*bs;
2226       ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2227       if (usecprow) xtmp = x + bs*ridx[i];
2228       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2229       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2230       v += n*bs2;
2231       if (!usecprow) xtmp += bs;
2232       workt = work;
2233       for (j=0; j<n; j++) {
2234         zb = z + bs*(*idx++);
2235         for (k=0; k<bs; k++) zb[k] += workt[k];
2236         workt += bs;
2237       }
2238     }
2239     }
2240   }
2241   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2242   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2243   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2244   PetscFunctionReturn(0);
2245 }
2246 
2247 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2248 {
2249   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2250   PetscInt       totalnz = a->bs2*a->nz;
2251   PetscScalar    oalpha  = alpha;
2252   PetscErrorCode ierr;
2253   PetscBLASInt   one = 1,tnz;
2254 
2255   PetscFunctionBegin;
2256   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2257   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2258   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2259   PetscFunctionReturn(0);
2260 }
2261 
2262 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2263 {
2264   PetscErrorCode ierr;
2265   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2266   MatScalar      *v  = a->a;
2267   PetscReal      sum = 0.0;
2268   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2269 
2270   PetscFunctionBegin;
2271   if (type == NORM_FROBENIUS) {
2272 #if defined(PETSC_USE_REAL___FP16)
2273     PetscBLASInt one = 1,cnt = bs2*nz;
2274     *norm = BLASnrm2_(&cnt,v,&one);
2275 #else
2276     for (i=0; i<bs2*nz; i++) {
2277       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2278     }
2279 #endif
2280     *norm = PetscSqrtReal(sum);
2281     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
2282   } else if (type == NORM_1) { /* maximum column sum */
2283     PetscReal *tmp;
2284     PetscInt  *bcol = a->j;
2285     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2286     for (i=0; i<nz; i++) {
2287       for (j=0; j<bs; j++) {
2288         k1 = bs*(*bcol) + j; /* column index */
2289         for (k=0; k<bs; k++) {
2290           tmp[k1] += PetscAbsScalar(*v); v++;
2291         }
2292       }
2293       bcol++;
2294     }
2295     *norm = 0.0;
2296     for (j=0; j<A->cmap->n; j++) {
2297       if (tmp[j] > *norm) *norm = tmp[j];
2298     }
2299     ierr = PetscFree(tmp);CHKERRQ(ierr);
2300     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2301   } else if (type == NORM_INFINITY) { /* maximum row sum */
2302     *norm = 0.0;
2303     for (k=0; k<bs; k++) {
2304       for (j=0; j<a->mbs; j++) {
2305         v   = a->a + bs2*a->i[j] + k;
2306         sum = 0.0;
2307         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2308           for (k1=0; k1<bs; k1++) {
2309             sum += PetscAbsScalar(*v);
2310             v   += bs;
2311           }
2312         }
2313         if (sum > *norm) *norm = sum;
2314       }
2315     }
2316     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2317   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2318   PetscFunctionReturn(0);
2319 }
2320 
2321 
2322 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2323 {
2324   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2325   PetscErrorCode ierr;
2326 
2327   PetscFunctionBegin;
2328   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2329   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2330     *flg = PETSC_FALSE;
2331     PetscFunctionReturn(0);
2332   }
2333 
2334   /* if the a->i are the same */
2335   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
2336   if (!*flg) PetscFunctionReturn(0);
2337 
2338   /* if a->j are the same */
2339   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
2340   if (!*flg) PetscFunctionReturn(0);
2341 
2342   /* if a->a are the same */
2343   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr);
2344   PetscFunctionReturn(0);
2345 
2346 }
2347 
2348 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2349 {
2350   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2351   PetscErrorCode ierr;
2352   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2353   PetscScalar    *x,zero = 0.0;
2354   MatScalar      *aa,*aa_j;
2355 
2356   PetscFunctionBegin;
2357   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2358   bs   = A->rmap->bs;
2359   aa   = a->a;
2360   ai   = a->i;
2361   aj   = a->j;
2362   ambs = a->mbs;
2363   bs2  = a->bs2;
2364 
2365   ierr = VecSet(v,zero);CHKERRQ(ierr);
2366   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2367   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2368   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2369   for (i=0; i<ambs; i++) {
2370     for (j=ai[i]; j<ai[i+1]; j++) {
2371       if (aj[j] == i) {
2372         row  = i*bs;
2373         aa_j = aa+j*bs2;
2374         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2375         break;
2376       }
2377     }
2378   }
2379   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2380   PetscFunctionReturn(0);
2381 }
2382 
2383 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2384 {
2385   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2386   const PetscScalar *l,*r,*li,*ri;
2387   PetscScalar       x;
2388   MatScalar         *aa, *v;
2389   PetscErrorCode    ierr;
2390   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2391   const PetscInt    *ai,*aj;
2392 
2393   PetscFunctionBegin;
2394   ai  = a->i;
2395   aj  = a->j;
2396   aa  = a->a;
2397   m   = A->rmap->n;
2398   n   = A->cmap->n;
2399   bs  = A->rmap->bs;
2400   mbs = a->mbs;
2401   bs2 = a->bs2;
2402   if (ll) {
2403     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2404     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2405     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2406     for (i=0; i<mbs; i++) { /* for each block row */
2407       M  = ai[i+1] - ai[i];
2408       li = l + i*bs;
2409       v  = aa + bs2*ai[i];
2410       for (j=0; j<M; j++) { /* for each block */
2411         for (k=0; k<bs2; k++) {
2412           (*v++) *= li[k%bs];
2413         }
2414       }
2415     }
2416     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2417     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2418   }
2419 
2420   if (rr) {
2421     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2422     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2423     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2424     for (i=0; i<mbs; i++) { /* for each block row */
2425       iai = ai[i];
2426       M   = ai[i+1] - iai;
2427       v   = aa + bs2*iai;
2428       for (j=0; j<M; j++) { /* for each block */
2429         ri = r + bs*aj[iai+j];
2430         for (k=0; k<bs; k++) {
2431           x = ri[k];
2432           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2433           v += bs;
2434         }
2435       }
2436     }
2437     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2438     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2439   }
2440   PetscFunctionReturn(0);
2441 }
2442 
2443 
2444 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2445 {
2446   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2447 
2448   PetscFunctionBegin;
2449   info->block_size   = a->bs2;
2450   info->nz_allocated = a->bs2*a->maxnz;
2451   info->nz_used      = a->bs2*a->nz;
2452   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2453   info->assemblies   = A->num_ass;
2454   info->mallocs      = A->info.mallocs;
2455   info->memory       = ((PetscObject)A)->mem;
2456   if (A->factortype) {
2457     info->fill_ratio_given  = A->info.fill_ratio_given;
2458     info->fill_ratio_needed = A->info.fill_ratio_needed;
2459     info->factor_mallocs    = A->info.factor_mallocs;
2460   } else {
2461     info->fill_ratio_given  = 0;
2462     info->fill_ratio_needed = 0;
2463     info->factor_mallocs    = 0;
2464   }
2465   PetscFunctionReturn(0);
2466 }
2467 
2468 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2469 {
2470   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2471   PetscErrorCode ierr;
2472 
2473   PetscFunctionBegin;
2474   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
2475   PetscFunctionReturn(0);
2476 }
2477 
2478 PETSC_INTERN PetscErrorCode MatMatMult_SeqBAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
2479 {
2480   PetscErrorCode ierr;
2481 
2482   PetscFunctionBegin;
2483   if (scall == MAT_INITIAL_MATRIX) {
2484     ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
2485     ierr = MatMatMultSymbolic_SeqBAIJ_SeqDense(A,B,fill,C);CHKERRQ(ierr);
2486     ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
2487   }
2488   ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
2489   ierr = MatMatMultNumeric_SeqBAIJ_SeqDense(A,B,*C);CHKERRQ(ierr);
2490   ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
2491   PetscFunctionReturn(0);
2492 }
2493 
2494 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
2495 {
2496   PetscErrorCode ierr;
2497 
2498   PetscFunctionBegin;
2499   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
2500 
2501   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2502   PetscFunctionReturn(0);
2503 }
2504 
2505 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2506 {
2507   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2508   PetscScalar       *z = 0,sum1;
2509   const PetscScalar *xb;
2510   PetscScalar       x1;
2511   const MatScalar   *v,*vv;
2512   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2513   PetscBool         usecprow=a->compressedrow.use;
2514 
2515   PetscFunctionBegin;
2516 
2517   idx = a->j;
2518   v   = a->a;
2519   if (usecprow) {
2520     mbs  = a->compressedrow.nrows;
2521     ii   = a->compressedrow.i;
2522     ridx = a->compressedrow.rindex;
2523   } else {
2524     mbs = a->mbs;
2525     ii  = a->i;
2526     z   = c;
2527   }
2528 
2529   for (i=0; i<mbs; i++) {
2530     n           = ii[1] - ii[0]; ii++;
2531     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2532     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2533     if (usecprow) z = c + ridx[i];
2534     jj = idx;
2535     vv = v;
2536     for (k=0; k<cn; k++) {
2537       idx = jj;
2538       v = vv;
2539       sum1    = 0.0;
2540       for (j=0; j<n; j++) {
2541         xb    = b + (*idx++); x1 = xb[0+k*cm];
2542         sum1 += v[0]*x1;
2543         v    += 1;
2544       }
2545       z[0+k*cm] = sum1;
2546     }
2547     if (!usecprow) z += 1;
2548   }
2549   PetscFunctionReturn(0);
2550 }
2551 
2552 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2553 {
2554   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2555   PetscScalar       *z = 0,sum1,sum2;
2556   const PetscScalar *xb;
2557   PetscScalar       x1,x2;
2558   const MatScalar   *v,*vv;
2559   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2560   PetscBool         usecprow=a->compressedrow.use;
2561 
2562   PetscFunctionBegin;
2563 
2564   idx = a->j;
2565   v   = a->a;
2566   if (usecprow) {
2567     mbs  = a->compressedrow.nrows;
2568     ii   = a->compressedrow.i;
2569     ridx = a->compressedrow.rindex;
2570   } else {
2571     mbs = a->mbs;
2572     ii  = a->i;
2573     z   = c;
2574   }
2575 
2576   for (i=0; i<mbs; i++) {
2577     n           = ii[1] - ii[0]; ii++;
2578     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2579     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2580     if (usecprow) z = c + 2*ridx[i];
2581     jj = idx;
2582     vv = v;
2583     for (k=0; k<cn; k++) {
2584       idx = jj;
2585       v = vv;
2586       sum1    = 0.0; sum2 = 0.0;
2587       for (j=0; j<n; j++) {
2588         xb    = b + 2*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm];
2589         sum1 += v[0]*x1 + v[2]*x2;
2590         sum2 += v[1]*x1 + v[3]*x2;
2591         v    += 4;
2592       }
2593       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2594     }
2595     if (!usecprow) z += 2;
2596   }
2597   PetscFunctionReturn(0);
2598 }
2599 
2600 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2601 {
2602   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2603   PetscScalar       *z = 0,sum1,sum2,sum3;
2604   const PetscScalar *xb;
2605   PetscScalar       x1,x2,x3;
2606   const MatScalar   *v,*vv;
2607   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2608   PetscBool         usecprow=a->compressedrow.use;
2609 
2610   PetscFunctionBegin;
2611 
2612   idx = a->j;
2613   v   = a->a;
2614   if (usecprow) {
2615     mbs  = a->compressedrow.nrows;
2616     ii   = a->compressedrow.i;
2617     ridx = a->compressedrow.rindex;
2618   } else {
2619     mbs = a->mbs;
2620     ii  = a->i;
2621     z   = c;
2622   }
2623 
2624   for (i=0; i<mbs; i++) {
2625     n           = ii[1] - ii[0]; ii++;
2626     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2627     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2628     if (usecprow) z = c + 3*ridx[i];
2629     jj = idx;
2630     vv = v;
2631     for (k=0; k<cn; k++) {
2632       idx = jj;
2633       v = vv;
2634       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
2635       for (j=0; j<n; j++) {
2636         xb    = b + 3*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm];
2637         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2638         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2639         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2640         v    += 9;
2641       }
2642       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2643     }
2644     if (!usecprow) z += 3;
2645   }
2646   PetscFunctionReturn(0);
2647 }
2648 
2649 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2650 {
2651   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2652   PetscScalar       *z = 0,sum1,sum2,sum3,sum4;
2653   const PetscScalar *xb;
2654   PetscScalar       x1,x2,x3,x4;
2655   const MatScalar   *v,*vv;
2656   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2657   PetscBool         usecprow=a->compressedrow.use;
2658 
2659   PetscFunctionBegin;
2660 
2661   idx = a->j;
2662   v   = a->a;
2663   if (usecprow) {
2664     mbs  = a->compressedrow.nrows;
2665     ii   = a->compressedrow.i;
2666     ridx = a->compressedrow.rindex;
2667   } else {
2668     mbs = a->mbs;
2669     ii  = a->i;
2670     z   = c;
2671   }
2672 
2673   for (i=0; i<mbs; i++) {
2674     n           = ii[1] - ii[0]; ii++;
2675     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2676     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2677     if (usecprow) z = c + 4*ridx[i];
2678     jj = idx;
2679     vv = v;
2680     for (k=0; k<cn; k++) {
2681       idx = jj;
2682       v = vv;
2683       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
2684       for (j=0; j<n; j++) {
2685         xb    = b + 4*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm];
2686         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2687         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2688         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2689         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2690         v    += 16;
2691       }
2692       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
2693     }
2694     if (!usecprow) z += 4;
2695   }
2696   PetscFunctionReturn(0);
2697 }
2698 
2699 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2700 {
2701   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2702   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5;
2703   const PetscScalar *xb;
2704   PetscScalar       x1,x2,x3,x4,x5;
2705   const MatScalar   *v,*vv;
2706   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2707   PetscBool         usecprow=a->compressedrow.use;
2708 
2709   PetscFunctionBegin;
2710 
2711   idx = a->j;
2712   v   = a->a;
2713   if (usecprow) {
2714     mbs  = a->compressedrow.nrows;
2715     ii   = a->compressedrow.i;
2716     ridx = a->compressedrow.rindex;
2717   } else {
2718     mbs = a->mbs;
2719     ii  = a->i;
2720     z   = c;
2721   }
2722 
2723   for (i=0; i<mbs; i++) {
2724     n           = ii[1] - ii[0]; ii++;
2725     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2726     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2727     if (usecprow) z = c + 5*ridx[i];
2728     jj = idx;
2729     vv = v;
2730     for (k=0; k<cn; k++) {
2731       idx = jj;
2732       v = vv;
2733       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
2734       for (j=0; j<n; j++) {
2735         xb    = b + 5*(*idx++); x1 = xb[0+k*cm]; x2 = xb[1+k*cm]; x3 = xb[2+k*cm]; x4 = xb[3+k*cm]; x5 = xb[4+k*cm];
2736         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2737         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2738         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2739         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2740         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2741         v    += 25;
2742       }
2743       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
2744     }
2745     if (!usecprow) z += 5;
2746   }
2747   PetscFunctionReturn(0);
2748 }
2749 
2750 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
2751 {
2752   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2753   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
2754   Mat_SeqDense      *cd = (Mat_SeqDense*)B->data;
2755   PetscErrorCode    ierr;
2756   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
2757   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2758   PetscBLASInt      bbs,bcn,bbm,bcm;
2759   PetscScalar       *z = 0;
2760   PetscScalar       *c,*b;
2761   const MatScalar   *v;
2762   const PetscInt    *idx,*ii,*ridx=NULL;
2763   PetscScalar       _DZero=0.0,_DOne=1.0;
2764   PetscBool         usecprow=a->compressedrow.use;
2765 
2766   PetscFunctionBegin;
2767   if (!cm || !cn) PetscFunctionReturn(0);
2768   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
2769   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
2770   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
2771   b = bd->v;
2772   if (a->nonzerorowcnt != A->rmap->n) {
2773     ierr = MatZeroEntries(C);CHKERRQ(ierr);
2774   }
2775   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
2776   switch (bs) {
2777   case 1:
2778     ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
2779     break;
2780   case 2:
2781     ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
2782     break;
2783   case 3:
2784     ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
2785     break;
2786   case 4:
2787     ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
2788     break;
2789   case 5:
2790     ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
2791     break;
2792   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2793     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
2794     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
2795     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
2796     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
2797     idx = a->j;
2798     v   = a->a;
2799     if (usecprow) {
2800       mbs  = a->compressedrow.nrows;
2801       ii   = a->compressedrow.i;
2802       ridx = a->compressedrow.rindex;
2803     } else {
2804       mbs = a->mbs;
2805       ii  = a->i;
2806       z   = c;
2807     }
2808     for (i=0; i<mbs; i++) {
2809       n = ii[1] - ii[0]; ii++;
2810       if (usecprow) z = c + bs*ridx[i];
2811       if (n) {
2812         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
2813         v += bs2;
2814       }
2815       for (j=1; j<n; j++) {
2816         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
2817         v += bs2;
2818       }
2819       if (!usecprow) z += bs;
2820     }
2821   }
2822   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
2823   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2824   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2825   ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr);
2826   PetscFunctionReturn(0);
2827 }
2828