xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision e69d178c2f48dff605cfe88e4730ccdae44cd6c9)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <petscblaslapack.h>
6 
7 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
8 {
9   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
10   PetscErrorCode ierr;
11   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
12   const PetscInt *idx;
13   PetscInt       start,end,*ai,*aj,bs,*nidx2;
14   PetscBT        table;
15 
16   PetscFunctionBegin;
17   m  = a->mbs;
18   ai = a->i;
19   aj = a->j;
20   bs = A->rmap->bs;
21 
22   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23 
24   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
25   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) {
29     /* Initialise the two local arrays */
30     isz  = 0;
31     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38     for (j=0; j<n; ++j) {
39       ival = idx[j]/bs; /* convert the indices into block indices */
40       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
42     }
43     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
44     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
45 
46     k = 0;
47     for (j=0; j<ov; j++) { /* for each overlap*/
48       n = isz;
49       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
50         row   = nidx[k];
51         start = ai[row];
52         end   = ai[row+1];
53         for (l = start; l<end; l++) {
54           val = aj[l];
55           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
56         }
57       }
58     }
59     /* expand the Index Set */
60     for (j=0; j<isz; j++) {
61       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
62     }
63     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
64   }
65   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
66   ierr = PetscFree(nidx);CHKERRQ(ierr);
67   ierr = PetscFree(nidx2);CHKERRQ(ierr);
68   PetscFunctionReturn(0);
69 }
70 
71 PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
72 {
73   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
74   PetscErrorCode ierr;
75   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
76   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
77   const PetscInt *irow,*icol;
78   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
79   PetscInt       *aj = a->j,*ai = a->i;
80   MatScalar      *mat_a;
81   Mat            C;
82   PetscBool      flag;
83 
84   PetscFunctionBegin;
85   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
86   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
87   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
88   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
89 
90   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
91   ssmap = smap;
92   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
93   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
94   /* determine lens of each row */
95   for (i=0; i<nrows; i++) {
96     kstart  = ai[irow[i]];
97     kend    = kstart + a->ilen[irow[i]];
98     lens[i] = 0;
99     for (k=kstart; k<kend; k++) {
100       if (ssmap[aj[k]]) lens[i]++;
101     }
102   }
103   /* Create and fill new matrix */
104   if (scall == MAT_REUSE_MATRIX) {
105     c = (Mat_SeqBAIJ*)((*B)->data);
106 
107     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
108     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
109     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
110     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
111     C    = *B;
112   } else {
113     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
114     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
115     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
116     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
117   }
118   c = (Mat_SeqBAIJ*)(C->data);
119   for (i=0; i<nrows; i++) {
120     row      = irow[i];
121     kstart   = ai[row];
122     kend     = kstart + a->ilen[row];
123     mat_i    = c->i[i];
124     mat_j    = c->j + mat_i;
125     mat_a    = c->a + mat_i*bs2;
126     mat_ilen = c->ilen + i;
127     for (k=kstart; k<kend; k++) {
128       if ((tcol=ssmap[a->j[k]])) {
129         *mat_j++ = tcol - 1;
130         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
131         mat_a   += bs2;
132         (*mat_ilen)++;
133       }
134     }
135   }
136   /* sort */
137   {
138     MatScalar *work;
139     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
140     for (i=0; i<nrows; i++) {
141       PetscInt ilen;
142       mat_i = c->i[i];
143       mat_j = c->j + mat_i;
144       mat_a = c->a + mat_i*bs2;
145       ilen  = c->ilen[i];
146       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
147     }
148     ierr = PetscFree(work);CHKERRQ(ierr);
149   }
150 
151   /* Free work space */
152   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
153   ierr = PetscFree(smap);CHKERRQ(ierr);
154   ierr = PetscFree(lens);CHKERRQ(ierr);
155   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
156   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
157 
158   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
159   *B   = C;
160   PetscFunctionReturn(0);
161 }
162 
163 PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
164 {
165   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166   IS             is1,is2;
167   PetscErrorCode ierr;
168   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
169   const PetscInt *irow,*icol;
170 
171   PetscFunctionBegin;
172   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
173   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
174   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
175   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
176 
177   /* Verify if the indices corespond to each element in a block
178    and form the IS with compressed IS */
179   maxmnbs = PetscMax(a->mbs,a->nbs);
180   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
181   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
182   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
183   for (i=0; i<a->mbs; i++) {
184     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185   }
186   count = 0;
187   for (i=0; i<nrows; i++) {
188     j = irow[i] / bs;
189     if ((vary[j]--)==bs) iary[count++] = j;
190   }
191   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
192 
193   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
194   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
195   for (i=0; i<a->nbs; i++) {
196     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
197   }
198   count = 0;
199   for (i=0; i<ncols; i++) {
200     j = icol[i] / bs;
201     if ((vary[j]--)==bs) iary[count++] = j;
202   }
203   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
206   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
207 
208   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
209   ierr = ISDestroy(&is1);CHKERRQ(ierr);
210   ierr = ISDestroy(&is2);CHKERRQ(ierr);
211   PetscFunctionReturn(0);
212 }
213 
214 PetscErrorCode MatDestroy_SeqBAIJ_Submatrices(Mat C)
215 {
216   PetscErrorCode ierr;
217   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
218   Mat_SubMat     *submatj = c->submatis1;
219 
220   PetscFunctionBegin;
221   ierr = submatj->destroy(C);CHKERRQ(ierr);
222   ierr = MatDestroySubMatrices_Private(submatj);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
227 {
228   PetscErrorCode ierr;
229   PetscInt       i;
230 
231   PetscFunctionBegin;
232   if (scall == MAT_INITIAL_MATRIX) {
233     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
234   }
235 
236   for (i=0; i<n; i++) {
237     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
238   }
239   PetscFunctionReturn(0);
240 }
241 
242 
243 /* -------------------------------------------------------*/
244 /* Should check that shapes of vectors and matrices match */
245 /* -------------------------------------------------------*/
246 
247 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
248 {
249   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
250   PetscScalar       *z,sum;
251   const PetscScalar *x;
252   const MatScalar   *v;
253   PetscErrorCode    ierr;
254   PetscInt          mbs,i,n;
255   const PetscInt    *idx,*ii,*ridx=NULL;
256   PetscBool         usecprow=a->compressedrow.use;
257 
258   PetscFunctionBegin;
259   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
260   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
261 
262   if (usecprow) {
263     mbs  = a->compressedrow.nrows;
264     ii   = a->compressedrow.i;
265     ridx = a->compressedrow.rindex;
266     ierr = PetscMemzero(z,a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
267   } else {
268     mbs = a->mbs;
269     ii  = a->i;
270   }
271 
272   for (i=0; i<mbs; i++) {
273     n   = ii[1] - ii[0];
274     v   = a->a + ii[0];
275     idx = a->j + ii[0];
276     ii++;
277     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
278     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
279     sum = 0.0;
280     PetscSparseDensePlusDot(sum,x,v,idx,n);
281     if (usecprow) {
282       z[ridx[i]] = sum;
283     } else {
284       z[i]       = sum;
285     }
286   }
287   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
288   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
289   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
290   PetscFunctionReturn(0);
291 }
292 
293 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
294 {
295   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
296   PetscScalar       *z = 0,sum1,sum2,*zarray;
297   const PetscScalar *x,*xb;
298   PetscScalar       x1,x2;
299   const MatScalar   *v;
300   PetscErrorCode    ierr;
301   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
302   PetscBool         usecprow=a->compressedrow.use;
303 
304   PetscFunctionBegin;
305   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
306   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
307 
308   idx = a->j;
309   v   = a->a;
310   if (usecprow) {
311     mbs  = a->compressedrow.nrows;
312     ii   = a->compressedrow.i;
313     ridx = a->compressedrow.rindex;
314     ierr = PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
315   } else {
316     mbs = a->mbs;
317     ii  = a->i;
318     z   = zarray;
319   }
320 
321   for (i=0; i<mbs; i++) {
322     n           = ii[1] - ii[0]; ii++;
323     sum1        = 0.0; sum2 = 0.0;
324     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
325     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
326     for (j=0; j<n; j++) {
327       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
328       sum1 += v[0]*x1 + v[2]*x2;
329       sum2 += v[1]*x1 + v[3]*x2;
330       v    += 4;
331     }
332     if (usecprow) z = zarray + 2*ridx[i];
333     z[0] = sum1; z[1] = sum2;
334     if (!usecprow) z += 2;
335   }
336   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
338   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
339   PetscFunctionReturn(0);
340 }
341 
342 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
343 {
344   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
345   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
346   const PetscScalar *x,*xb;
347   const MatScalar   *v;
348   PetscErrorCode    ierr;
349   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
350   PetscBool         usecprow=a->compressedrow.use;
351 
352 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
353 #pragma disjoint(*v,*z,*xb)
354 #endif
355 
356   PetscFunctionBegin;
357   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
358   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
359 
360   idx = a->j;
361   v   = a->a;
362   if (usecprow) {
363     mbs  = a->compressedrow.nrows;
364     ii   = a->compressedrow.i;
365     ridx = a->compressedrow.rindex;
366     ierr = PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
367   } else {
368     mbs = a->mbs;
369     ii  = a->i;
370     z   = zarray;
371   }
372 
373   for (i=0; i<mbs; i++) {
374     n           = ii[1] - ii[0]; ii++;
375     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
376     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
377     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378     for (j=0; j<n; j++) {
379       xb = x + 3*(*idx++);
380       x1 = xb[0];
381       x2 = xb[1];
382       x3 = xb[2];
383 
384       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
385       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
386       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
387       v    += 9;
388     }
389     if (usecprow) z = zarray + 3*ridx[i];
390     z[0] = sum1; z[1] = sum2; z[2] = sum3;
391     if (!usecprow) z += 3;
392   }
393   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
394   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
395   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
396   PetscFunctionReturn(0);
397 }
398 
399 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
400 {
401   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
402   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
403   const PetscScalar *x,*xb;
404   const MatScalar   *v;
405   PetscErrorCode    ierr;
406   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
407   PetscBool         usecprow=a->compressedrow.use;
408 
409   PetscFunctionBegin;
410   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
411   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
412 
413   idx = a->j;
414   v   = a->a;
415   if (usecprow) {
416     mbs  = a->compressedrow.nrows;
417     ii   = a->compressedrow.i;
418     ridx = a->compressedrow.rindex;
419     ierr = PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
420   } else {
421     mbs = a->mbs;
422     ii  = a->i;
423     z   = zarray;
424   }
425 
426   for (i=0; i<mbs; i++) {
427     n = ii[1] - ii[0];
428     ii++;
429     sum1 = 0.0;
430     sum2 = 0.0;
431     sum3 = 0.0;
432     sum4 = 0.0;
433 
434     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
435     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
436     for (j=0; j<n; j++) {
437       xb    = x + 4*(*idx++);
438       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
439       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
440       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
441       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
442       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
443       v    += 16;
444     }
445     if (usecprow) z = zarray + 4*ridx[i];
446     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
447     if (!usecprow) z += 4;
448   }
449   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
450   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
451   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
452   PetscFunctionReturn(0);
453 }
454 
455 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
456 {
457   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
458   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
459   const PetscScalar *xb,*x;
460   const MatScalar   *v;
461   PetscErrorCode    ierr;
462   const PetscInt    *idx,*ii,*ridx=NULL;
463   PetscInt          mbs,i,j,n;
464   PetscBool         usecprow=a->compressedrow.use;
465 
466   PetscFunctionBegin;
467   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
468   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
469 
470   idx = a->j;
471   v   = a->a;
472   if (usecprow) {
473     mbs  = a->compressedrow.nrows;
474     ii   = a->compressedrow.i;
475     ridx = a->compressedrow.rindex;
476     ierr = PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
477   } else {
478     mbs = a->mbs;
479     ii  = a->i;
480     z   = zarray;
481   }
482 
483   for (i=0; i<mbs; i++) {
484     n           = ii[1] - ii[0]; ii++;
485     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
486     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
487     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
488     for (j=0; j<n; j++) {
489       xb    = x + 5*(*idx++);
490       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
491       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
492       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
493       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
494       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
495       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
496       v    += 25;
497     }
498     if (usecprow) z = zarray + 5*ridx[i];
499     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
500     if (!usecprow) z += 5;
501   }
502   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
503   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
504   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
505   PetscFunctionReturn(0);
506 }
507 
508 
509 
510 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
511 {
512   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
513   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
514   const PetscScalar *x,*xb;
515   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
516   const MatScalar   *v;
517   PetscErrorCode    ierr;
518   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
519   PetscBool         usecprow=a->compressedrow.use;
520 
521   PetscFunctionBegin;
522   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
523   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
524 
525   idx = a->j;
526   v   = a->a;
527   if (usecprow) {
528     mbs  = a->compressedrow.nrows;
529     ii   = a->compressedrow.i;
530     ridx = a->compressedrow.rindex;
531     ierr = PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
532   } else {
533     mbs = a->mbs;
534     ii  = a->i;
535     z   = zarray;
536   }
537 
538   for (i=0; i<mbs; i++) {
539     n  = ii[1] - ii[0];
540     ii++;
541     sum1 = 0.0;
542     sum2 = 0.0;
543     sum3 = 0.0;
544     sum4 = 0.0;
545     sum5 = 0.0;
546     sum6 = 0.0;
547 
548     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
549     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
550     for (j=0; j<n; j++) {
551       xb    = x + 6*(*idx++);
552       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
553       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
554       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
555       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
556       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
557       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
558       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
559       v    += 36;
560     }
561     if (usecprow) z = zarray + 6*ridx[i];
562     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
563     if (!usecprow) z += 6;
564   }
565 
566   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
567   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
568   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
569   PetscFunctionReturn(0);
570 }
571 
572 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
573 {
574   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
575   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
576   const PetscScalar *x,*xb;
577   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
578   const MatScalar   *v;
579   PetscErrorCode    ierr;
580   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
581   PetscBool         usecprow=a->compressedrow.use;
582 
583   PetscFunctionBegin;
584   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
585   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
586 
587   idx = a->j;
588   v   = a->a;
589   if (usecprow) {
590     mbs  = a->compressedrow.nrows;
591     ii   = a->compressedrow.i;
592     ridx = a->compressedrow.rindex;
593     ierr = PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
594   } else {
595     mbs = a->mbs;
596     ii  = a->i;
597     z   = zarray;
598   }
599 
600   for (i=0; i<mbs; i++) {
601     n  = ii[1] - ii[0];
602     ii++;
603     sum1 = 0.0;
604     sum2 = 0.0;
605     sum3 = 0.0;
606     sum4 = 0.0;
607     sum5 = 0.0;
608     sum6 = 0.0;
609     sum7 = 0.0;
610 
611     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
612     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
613     for (j=0; j<n; j++) {
614       xb    = x + 7*(*idx++);
615       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
616       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
617       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
618       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
619       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
620       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
621       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
622       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
623       v    += 49;
624     }
625     if (usecprow) z = zarray + 7*ridx[i];
626     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
627     if (!usecprow) z += 7;
628   }
629 
630   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
631   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
632   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
633   PetscFunctionReturn(0);
634 }
635 
636 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
637 /* Default MatMult for block size 15 */
638 
639 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
640 {
641   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
642   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
643   const PetscScalar *x,*xb;
644   PetscScalar       *zarray,xv;
645   const MatScalar   *v;
646   PetscErrorCode    ierr;
647   const PetscInt    *ii,*ij=a->j,*idx;
648   PetscInt          mbs,i,j,k,n,*ridx=NULL;
649   PetscBool         usecprow=a->compressedrow.use;
650 
651   PetscFunctionBegin;
652   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
653   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
654 
655   v = a->a;
656   if (usecprow) {
657     mbs  = a->compressedrow.nrows;
658     ii   = a->compressedrow.i;
659     ridx = a->compressedrow.rindex;
660     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
661   } else {
662     mbs = a->mbs;
663     ii  = a->i;
664     z   = zarray;
665   }
666 
667   for (i=0; i<mbs; i++) {
668     n    = ii[i+1] - ii[i];
669     idx  = ij + ii[i];
670     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
671     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
672 
673     for (j=0; j<n; j++) {
674       xb = x + 15*(idx[j]);
675 
676       for (k=0; k<15; k++) {
677         xv     =  xb[k];
678         sum1  += v[0]*xv;
679         sum2  += v[1]*xv;
680         sum3  += v[2]*xv;
681         sum4  += v[3]*xv;
682         sum5  += v[4]*xv;
683         sum6  += v[5]*xv;
684         sum7  += v[6]*xv;
685         sum8  += v[7]*xv;
686         sum9  += v[8]*xv;
687         sum10 += v[9]*xv;
688         sum11 += v[10]*xv;
689         sum12 += v[11]*xv;
690         sum13 += v[12]*xv;
691         sum14 += v[13]*xv;
692         sum15 += v[14]*xv;
693         v     += 15;
694       }
695     }
696     if (usecprow) z = zarray + 15*ridx[i];
697     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
698     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
699 
700     if (!usecprow) z += 15;
701   }
702 
703   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
704   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
705   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
706   PetscFunctionReturn(0);
707 }
708 
709 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
710 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
711 {
712   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
713   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
714   const PetscScalar *x,*xb;
715   PetscScalar       x1,x2,x3,x4,*zarray;
716   const MatScalar   *v;
717   PetscErrorCode    ierr;
718   const PetscInt    *ii,*ij=a->j,*idx;
719   PetscInt          mbs,i,j,n,*ridx=NULL;
720   PetscBool         usecprow=a->compressedrow.use;
721 
722   PetscFunctionBegin;
723   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
724   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
725 
726   v = a->a;
727   if (usecprow) {
728     mbs  = a->compressedrow.nrows;
729     ii   = a->compressedrow.i;
730     ridx = a->compressedrow.rindex;
731     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
732   } else {
733     mbs = a->mbs;
734     ii  = a->i;
735     z   = zarray;
736   }
737 
738   for (i=0; i<mbs; i++) {
739     n    = ii[i+1] - ii[i];
740     idx  = ij + ii[i];
741     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
742     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
743 
744     for (j=0; j<n; j++) {
745       xb = x + 15*(idx[j]);
746       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
747 
748       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
749       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
750       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
751       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
752       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
753       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
754       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
755       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
756       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
757       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
758       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
759       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
760       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
761       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
762       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
763 
764       v += 60;
765 
766       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
767 
768       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
769       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
770       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
771       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
772       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
773       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
774       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
775       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
776       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
777       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
778       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
779       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
780       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
781       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
782       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
783       v     += 60;
784 
785       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
786       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
787       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
788       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
789       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
790       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
791       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
792       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
793       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
794       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
795       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
796       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
797       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
798       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
799       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
800       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
801       v     += 60;
802 
803       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
804       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
805       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
806       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
807       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
808       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
809       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
810       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
811       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
812       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
813       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
814       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
815       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
816       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
817       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
818       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
819       v     += 45;
820     }
821     if (usecprow) z = zarray + 15*ridx[i];
822     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
823     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
824 
825     if (!usecprow) z += 15;
826   }
827 
828   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
829   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
830   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
831   PetscFunctionReturn(0);
832 }
833 
834 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
835 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
836 {
837   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
838   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
839   const PetscScalar *x,*xb;
840   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
841   const MatScalar   *v;
842   PetscErrorCode    ierr;
843   const PetscInt    *ii,*ij=a->j,*idx;
844   PetscInt          mbs,i,j,n,*ridx=NULL;
845   PetscBool         usecprow=a->compressedrow.use;
846 
847   PetscFunctionBegin;
848   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
849   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
850 
851   v = a->a;
852   if (usecprow) {
853     mbs  = a->compressedrow.nrows;
854     ii   = a->compressedrow.i;
855     ridx = a->compressedrow.rindex;
856     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
857   } else {
858     mbs = a->mbs;
859     ii  = a->i;
860     z   = zarray;
861   }
862 
863   for (i=0; i<mbs; i++) {
864     n    = ii[i+1] - ii[i];
865     idx  = ij + ii[i];
866     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
867     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
868 
869     for (j=0; j<n; j++) {
870       xb = x + 15*(idx[j]);
871       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
872       x8 = xb[7];
873 
874       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
875       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
876       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
877       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
878       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
879       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
880       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
881       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
882       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
883       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
884       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
885       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
886       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
887       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
888       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
889       v     += 120;
890 
891       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
892 
893       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
894       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
895       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
896       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
897       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
898       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
899       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
900       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
901       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
902       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
903       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
904       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
905       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
906       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
907       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
908       v     += 105;
909     }
910     if (usecprow) z = zarray + 15*ridx[i];
911     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
912     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
913 
914     if (!usecprow) z += 15;
915   }
916 
917   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
918   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
919   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
920   PetscFunctionReturn(0);
921 }
922 
923 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
924 
925 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
926 {
927   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
928   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
929   const PetscScalar *x,*xb;
930   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
931   const MatScalar   *v;
932   PetscErrorCode    ierr;
933   const PetscInt    *ii,*ij=a->j,*idx;
934   PetscInt          mbs,i,j,n,*ridx=NULL;
935   PetscBool         usecprow=a->compressedrow.use;
936 
937   PetscFunctionBegin;
938   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
939   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
940 
941   v = a->a;
942   if (usecprow) {
943     mbs  = a->compressedrow.nrows;
944     ii   = a->compressedrow.i;
945     ridx = a->compressedrow.rindex;
946     ierr = PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
947   } else {
948     mbs = a->mbs;
949     ii  = a->i;
950     z   = zarray;
951   }
952 
953   for (i=0; i<mbs; i++) {
954     n    = ii[i+1] - ii[i];
955     idx  = ij + ii[i];
956     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
957     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
958 
959     for (j=0; j<n; j++) {
960       xb = x + 15*(idx[j]);
961       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
962       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
963 
964       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
965       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
966       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
967       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
968       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
969       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
970       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
971       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
972       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
973       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
974       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
975       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
976       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
977       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
978       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
979       v     += 225;
980     }
981     if (usecprow) z = zarray + 15*ridx[i];
982     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
983     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
984 
985     if (!usecprow) z += 15;
986   }
987 
988   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
989   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
990   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
991   PetscFunctionReturn(0);
992 }
993 
994 
995 /*
996     This will not work with MatScalar == float because it calls the BLAS
997 */
998 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
999 {
1000   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1001   PetscScalar       *z = 0,*work,*workt,*zarray;
1002   const PetscScalar *x,*xb;
1003   const MatScalar   *v;
1004   PetscErrorCode    ierr;
1005   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1006   const PetscInt    *idx,*ii,*ridx=NULL;
1007   PetscInt          ncols,k;
1008   PetscBool         usecprow=a->compressedrow.use;
1009 
1010   PetscFunctionBegin;
1011   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1012   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1013 
1014   idx = a->j;
1015   v   = a->a;
1016   if (usecprow) {
1017     mbs  = a->compressedrow.nrows;
1018     ii   = a->compressedrow.i;
1019     ridx = a->compressedrow.rindex;
1020     ierr = PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1021   } else {
1022     mbs = a->mbs;
1023     ii  = a->i;
1024     z   = zarray;
1025   }
1026 
1027   if (!a->mult_work) {
1028     k    = PetscMax(A->rmap->n,A->cmap->n);
1029     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1030   }
1031   work = a->mult_work;
1032   for (i=0; i<mbs; i++) {
1033     n           = ii[1] - ii[0]; ii++;
1034     ncols       = n*bs;
1035     workt       = work;
1036     for (j=0; j<n; j++) {
1037       xb = x + bs*(*idx++);
1038       for (k=0; k<bs; k++) workt[k] = xb[k];
1039       workt += bs;
1040     }
1041     if (usecprow) z = zarray + bs*ridx[i];
1042     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1043     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1044     v += n*bs2;
1045     if (!usecprow) z += bs;
1046   }
1047   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1048   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1049   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1050   PetscFunctionReturn(0);
1051 }
1052 
1053 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1054 {
1055   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1056   const PetscScalar *x;
1057   PetscScalar       *y,*z,sum;
1058   const MatScalar   *v;
1059   PetscErrorCode    ierr;
1060   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1061   const PetscInt    *idx,*ii;
1062   PetscBool         usecprow=a->compressedrow.use;
1063 
1064   PetscFunctionBegin;
1065   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1066   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1067 
1068   idx = a->j;
1069   v   = a->a;
1070   if (usecprow) {
1071     if (zz != yy) {
1072       ierr = PetscMemcpy(z,y,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1073     }
1074     mbs  = a->compressedrow.nrows;
1075     ii   = a->compressedrow.i;
1076     ridx = a->compressedrow.rindex;
1077   } else {
1078     ii = a->i;
1079   }
1080 
1081   for (i=0; i<mbs; i++) {
1082     n = ii[1] - ii[0];
1083     ii++;
1084     if (!usecprow) {
1085       sum         = y[i];
1086     } else {
1087       sum = y[ridx[i]];
1088     }
1089     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1090     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1091     PetscSparseDensePlusDot(sum,x,v,idx,n);
1092     v   += n;
1093     idx += n;
1094     if (usecprow) {
1095       z[ridx[i]] = sum;
1096     } else {
1097       z[i] = sum;
1098     }
1099   }
1100   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1101   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1102   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1103   PetscFunctionReturn(0);
1104 }
1105 
1106 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1107 {
1108   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1109   PetscScalar       *y = 0,*z = 0,sum1,sum2;
1110   const PetscScalar *x,*xb;
1111   PetscScalar       x1,x2,*yarray,*zarray;
1112   const MatScalar   *v;
1113   PetscErrorCode    ierr;
1114   PetscInt          mbs = a->mbs,i,n,j;
1115   const PetscInt    *idx,*ii,*ridx = NULL;
1116   PetscBool         usecprow = a->compressedrow.use;
1117 
1118   PetscFunctionBegin;
1119   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1120   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1121 
1122   idx = a->j;
1123   v   = a->a;
1124   if (usecprow) {
1125     if (zz != yy) {
1126       ierr = PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1127     }
1128     mbs  = a->compressedrow.nrows;
1129     ii   = a->compressedrow.i;
1130     ridx = a->compressedrow.rindex;
1131   } else {
1132     ii = a->i;
1133     y  = yarray;
1134     z  = zarray;
1135   }
1136 
1137   for (i=0; i<mbs; i++) {
1138     n = ii[1] - ii[0]; ii++;
1139     if (usecprow) {
1140       z = zarray + 2*ridx[i];
1141       y = yarray + 2*ridx[i];
1142     }
1143     sum1 = y[0]; sum2 = y[1];
1144     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1145     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1146     for (j=0; j<n; j++) {
1147       xb = x + 2*(*idx++);
1148       x1 = xb[0];
1149       x2 = xb[1];
1150 
1151       sum1 += v[0]*x1 + v[2]*x2;
1152       sum2 += v[1]*x1 + v[3]*x2;
1153       v    += 4;
1154     }
1155     z[0] = sum1; z[1] = sum2;
1156     if (!usecprow) {
1157       z += 2; y += 2;
1158     }
1159   }
1160   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1161   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1167 {
1168   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1169   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1170   const PetscScalar *x,*xb;
1171   const MatScalar   *v;
1172   PetscErrorCode    ierr;
1173   PetscInt          mbs = a->mbs,i,j,n;
1174   const PetscInt    *idx,*ii,*ridx = NULL;
1175   PetscBool         usecprow = a->compressedrow.use;
1176 
1177   PetscFunctionBegin;
1178   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1179   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1180 
1181   idx = a->j;
1182   v   = a->a;
1183   if (usecprow) {
1184     if (zz != yy) {
1185       ierr = PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1186     }
1187     mbs  = a->compressedrow.nrows;
1188     ii   = a->compressedrow.i;
1189     ridx = a->compressedrow.rindex;
1190   } else {
1191     ii = a->i;
1192     y  = yarray;
1193     z  = zarray;
1194   }
1195 
1196   for (i=0; i<mbs; i++) {
1197     n = ii[1] - ii[0]; ii++;
1198     if (usecprow) {
1199       z = zarray + 3*ridx[i];
1200       y = yarray + 3*ridx[i];
1201     }
1202     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1203     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1204     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1205     for (j=0; j<n; j++) {
1206       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1207       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1208       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1209       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1210       v    += 9;
1211     }
1212     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1213     if (!usecprow) {
1214       z += 3; y += 3;
1215     }
1216   }
1217   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1218   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1219   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1220   PetscFunctionReturn(0);
1221 }
1222 
1223 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1224 {
1225   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1226   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1227   const PetscScalar *x,*xb;
1228   const MatScalar   *v;
1229   PetscErrorCode    ierr;
1230   PetscInt          mbs = a->mbs,i,j,n;
1231   const PetscInt    *idx,*ii,*ridx=NULL;
1232   PetscBool         usecprow=a->compressedrow.use;
1233 
1234   PetscFunctionBegin;
1235   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1236   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1237 
1238   idx = a->j;
1239   v   = a->a;
1240   if (usecprow) {
1241     if (zz != yy) {
1242       ierr = PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1243     }
1244     mbs  = a->compressedrow.nrows;
1245     ii   = a->compressedrow.i;
1246     ridx = a->compressedrow.rindex;
1247   } else {
1248     ii = a->i;
1249     y  = yarray;
1250     z  = zarray;
1251   }
1252 
1253   for (i=0; i<mbs; i++) {
1254     n = ii[1] - ii[0]; ii++;
1255     if (usecprow) {
1256       z = zarray + 4*ridx[i];
1257       y = yarray + 4*ridx[i];
1258     }
1259     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1260     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1261     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1262     for (j=0; j<n; j++) {
1263       xb    = x + 4*(*idx++);
1264       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1265       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1266       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1267       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1268       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1269       v    += 16;
1270     }
1271     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1272     if (!usecprow) {
1273       z += 4; y += 4;
1274     }
1275   }
1276   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1277   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1278   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1279   PetscFunctionReturn(0);
1280 }
1281 
1282 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1283 {
1284   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1285   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1286   const PetscScalar *x,*xb;
1287   PetscScalar       *yarray,*zarray;
1288   const MatScalar   *v;
1289   PetscErrorCode    ierr;
1290   PetscInt          mbs = a->mbs,i,j,n;
1291   const PetscInt    *idx,*ii,*ridx = NULL;
1292   PetscBool         usecprow=a->compressedrow.use;
1293 
1294   PetscFunctionBegin;
1295   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1296   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1297 
1298   idx = a->j;
1299   v   = a->a;
1300   if (usecprow) {
1301     if (zz != yy) {
1302       ierr = PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1303     }
1304     mbs  = a->compressedrow.nrows;
1305     ii   = a->compressedrow.i;
1306     ridx = a->compressedrow.rindex;
1307   } else {
1308     ii = a->i;
1309     y  = yarray;
1310     z  = zarray;
1311   }
1312 
1313   for (i=0; i<mbs; i++) {
1314     n = ii[1] - ii[0]; ii++;
1315     if (usecprow) {
1316       z = zarray + 5*ridx[i];
1317       y = yarray + 5*ridx[i];
1318     }
1319     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1320     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1321     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1322     for (j=0; j<n; j++) {
1323       xb    = x + 5*(*idx++);
1324       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1325       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1326       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1327       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1328       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1329       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1330       v    += 25;
1331     }
1332     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1333     if (!usecprow) {
1334       z += 5; y += 5;
1335     }
1336   }
1337   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1338   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1339   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1340   PetscFunctionReturn(0);
1341 }
1342 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1343 {
1344   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1345   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1346   const PetscScalar *x,*xb;
1347   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1348   const MatScalar   *v;
1349   PetscErrorCode    ierr;
1350   PetscInt          mbs = a->mbs,i,j,n;
1351   const PetscInt    *idx,*ii,*ridx=NULL;
1352   PetscBool         usecprow=a->compressedrow.use;
1353 
1354   PetscFunctionBegin;
1355   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1356   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1357 
1358   idx = a->j;
1359   v   = a->a;
1360   if (usecprow) {
1361     if (zz != yy) {
1362       ierr = PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1363     }
1364     mbs  = a->compressedrow.nrows;
1365     ii   = a->compressedrow.i;
1366     ridx = a->compressedrow.rindex;
1367   } else {
1368     ii = a->i;
1369     y  = yarray;
1370     z  = zarray;
1371   }
1372 
1373   for (i=0; i<mbs; i++) {
1374     n = ii[1] - ii[0]; ii++;
1375     if (usecprow) {
1376       z = zarray + 6*ridx[i];
1377       y = yarray + 6*ridx[i];
1378     }
1379     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1380     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1381     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1382     for (j=0; j<n; j++) {
1383       xb    = x + 6*(*idx++);
1384       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1385       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1386       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1387       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1388       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1389       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1390       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1391       v    += 36;
1392     }
1393     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1394     if (!usecprow) {
1395       z += 6; y += 6;
1396     }
1397   }
1398   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1399   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1400   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1401   PetscFunctionReturn(0);
1402 }
1403 
1404 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1405 {
1406   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1407   PetscScalar       *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1408   const PetscScalar *x,*xb;
1409   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1410   const MatScalar   *v;
1411   PetscErrorCode    ierr;
1412   PetscInt          mbs = a->mbs,i,j,n;
1413   const PetscInt    *idx,*ii,*ridx = NULL;
1414   PetscBool         usecprow=a->compressedrow.use;
1415 
1416   PetscFunctionBegin;
1417   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1418   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1419 
1420   idx = a->j;
1421   v   = a->a;
1422   if (usecprow) {
1423     if (zz != yy) {
1424       ierr = PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1425     }
1426     mbs  = a->compressedrow.nrows;
1427     ii   = a->compressedrow.i;
1428     ridx = a->compressedrow.rindex;
1429   } else {
1430     ii = a->i;
1431     y  = yarray;
1432     z  = zarray;
1433   }
1434 
1435   for (i=0; i<mbs; i++) {
1436     n = ii[1] - ii[0]; ii++;
1437     if (usecprow) {
1438       z = zarray + 7*ridx[i];
1439       y = yarray + 7*ridx[i];
1440     }
1441     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1442     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1443     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1444     for (j=0; j<n; j++) {
1445       xb    = x + 7*(*idx++);
1446       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1447       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1448       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1449       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1450       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1451       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1452       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1453       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1454       v    += 49;
1455     }
1456     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1457     if (!usecprow) {
1458       z += 7; y += 7;
1459     }
1460   }
1461   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1462   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1463   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1464   PetscFunctionReturn(0);
1465 }
1466 
1467 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1468 {
1469   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1470   PetscScalar       *z = 0,*work,*workt,*zarray;
1471   const PetscScalar *x,*xb;
1472   const MatScalar   *v;
1473   PetscErrorCode    ierr;
1474   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1475   PetscInt          ncols,k;
1476   const PetscInt    *ridx = NULL,*idx,*ii;
1477   PetscBool         usecprow = a->compressedrow.use;
1478 
1479   PetscFunctionBegin;
1480   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1481   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1482   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1483 
1484   idx = a->j;
1485   v   = a->a;
1486   if (usecprow) {
1487     mbs  = a->compressedrow.nrows;
1488     ii   = a->compressedrow.i;
1489     ridx = a->compressedrow.rindex;
1490   } else {
1491     mbs = a->mbs;
1492     ii  = a->i;
1493     z   = zarray;
1494   }
1495 
1496   if (!a->mult_work) {
1497     k    = PetscMax(A->rmap->n,A->cmap->n);
1498     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1499   }
1500   work = a->mult_work;
1501   for (i=0; i<mbs; i++) {
1502     n     = ii[1] - ii[0]; ii++;
1503     ncols = n*bs;
1504     workt = work;
1505     for (j=0; j<n; j++) {
1506       xb = x + bs*(*idx++);
1507       for (k=0; k<bs; k++) workt[k] = xb[k];
1508       workt += bs;
1509     }
1510     if (usecprow) z = zarray + bs*ridx[i];
1511     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1512     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1513     v += n*bs2;
1514     if (!usecprow) z += bs;
1515   }
1516   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1517   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1518   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1519   PetscFunctionReturn(0);
1520 }
1521 
1522 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1523 {
1524   PetscScalar    zero = 0.0;
1525   PetscErrorCode ierr;
1526 
1527   PetscFunctionBegin;
1528   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1529   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1530   PetscFunctionReturn(0);
1531 }
1532 
1533 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1534 {
1535   PetscScalar    zero = 0.0;
1536   PetscErrorCode ierr;
1537 
1538   PetscFunctionBegin;
1539   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1540   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1545 {
1546   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1547   PetscScalar       *z,x1,x2,x3,x4,x5;
1548   const PetscScalar *x,*xb = NULL;
1549   const MatScalar   *v;
1550   PetscErrorCode    ierr;
1551   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1552   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1553   Mat_CompressedRow cprow = a->compressedrow;
1554   PetscBool         usecprow = cprow.use;
1555 
1556   PetscFunctionBegin;
1557   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1558   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1559   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1560 
1561   idx = a->j;
1562   v   = a->a;
1563   if (usecprow) {
1564     mbs  = cprow.nrows;
1565     ii   = cprow.i;
1566     ridx = cprow.rindex;
1567   } else {
1568     mbs=a->mbs;
1569     ii = a->i;
1570     xb = x;
1571   }
1572 
1573   switch (bs) {
1574   case 1:
1575     for (i=0; i<mbs; i++) {
1576       if (usecprow) xb = x + ridx[i];
1577       x1 = xb[0];
1578       ib = idx + ii[0];
1579       n  = ii[1] - ii[0]; ii++;
1580       for (j=0; j<n; j++) {
1581         rval     = ib[j];
1582         z[rval] += PetscConj(*v) * x1;
1583         v++;
1584       }
1585       if (!usecprow) xb++;
1586     }
1587     break;
1588   case 2:
1589     for (i=0; i<mbs; i++) {
1590       if (usecprow) xb = x + 2*ridx[i];
1591       x1 = xb[0]; x2 = xb[1];
1592       ib = idx + ii[0];
1593       n  = ii[1] - ii[0]; ii++;
1594       for (j=0; j<n; j++) {
1595         rval       = ib[j]*2;
1596         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1597         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1598         v         += 4;
1599       }
1600       if (!usecprow) xb += 2;
1601     }
1602     break;
1603   case 3:
1604     for (i=0; i<mbs; i++) {
1605       if (usecprow) xb = x + 3*ridx[i];
1606       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1607       ib = idx + ii[0];
1608       n  = ii[1] - ii[0]; ii++;
1609       for (j=0; j<n; j++) {
1610         rval       = ib[j]*3;
1611         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1612         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1613         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1614         v         += 9;
1615       }
1616       if (!usecprow) xb += 3;
1617     }
1618     break;
1619   case 4:
1620     for (i=0; i<mbs; i++) {
1621       if (usecprow) xb = x + 4*ridx[i];
1622       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1623       ib = idx + ii[0];
1624       n  = ii[1] - ii[0]; ii++;
1625       for (j=0; j<n; j++) {
1626         rval       = ib[j]*4;
1627         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1628         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1629         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1630         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1631         v         += 16;
1632       }
1633       if (!usecprow) xb += 4;
1634     }
1635     break;
1636   case 5:
1637     for (i=0; i<mbs; i++) {
1638       if (usecprow) xb = x + 5*ridx[i];
1639       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1640       x4 = xb[3]; x5 = xb[4];
1641       ib = idx + ii[0];
1642       n  = ii[1] - ii[0]; ii++;
1643       for (j=0; j<n; j++) {
1644         rval       = ib[j]*5;
1645         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1646         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1647         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1648         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1649         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1650         v         += 25;
1651       }
1652       if (!usecprow) xb += 5;
1653     }
1654     break;
1655   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1656     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1657 #if 0
1658     {
1659       PetscInt          ncols,k,bs2=a->bs2;
1660       PetscScalar       *work,*workt,zb;
1661       const PetscScalar *xtmp;
1662       if (!a->mult_work) {
1663         k    = PetscMax(A->rmap->n,A->cmap->n);
1664         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1665       }
1666       work = a->mult_work;
1667       xtmp = x;
1668       for (i=0; i<mbs; i++) {
1669         n     = ii[1] - ii[0]; ii++;
1670         ncols = n*bs;
1671         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1672         if (usecprow) xtmp = x + bs*ridx[i];
1673         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1674         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1675         v += n*bs2;
1676         if (!usecprow) xtmp += bs;
1677         workt = work;
1678         for (j=0; j<n; j++) {
1679           zb = z + bs*(*idx++);
1680           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1681           workt += bs;
1682         }
1683       }
1684     }
1685 #endif
1686   }
1687   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1688   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1689   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1690   PetscFunctionReturn(0);
1691 }
1692 
1693 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1694 {
1695   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1696   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
1697   const PetscScalar *x,*xb = 0;
1698   const MatScalar   *v;
1699   PetscErrorCode    ierr;
1700   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1701   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1702   Mat_CompressedRow cprow   = a->compressedrow;
1703   PetscBool         usecprow=cprow.use;
1704 
1705   PetscFunctionBegin;
1706   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1707   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1708   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1709 
1710   idx = a->j;
1711   v   = a->a;
1712   if (usecprow) {
1713     mbs  = cprow.nrows;
1714     ii   = cprow.i;
1715     ridx = cprow.rindex;
1716   } else {
1717     mbs=a->mbs;
1718     ii = a->i;
1719     xb = x;
1720   }
1721 
1722   switch (bs) {
1723   case 1:
1724     for (i=0; i<mbs; i++) {
1725       if (usecprow) xb = x + ridx[i];
1726       x1 = xb[0];
1727       ib = idx + ii[0];
1728       n  = ii[1] - ii[0]; ii++;
1729       for (j=0; j<n; j++) {
1730         rval     = ib[j];
1731         z[rval] += *v * x1;
1732         v++;
1733       }
1734       if (!usecprow) xb++;
1735     }
1736     break;
1737   case 2:
1738     for (i=0; i<mbs; i++) {
1739       if (usecprow) xb = x + 2*ridx[i];
1740       x1 = xb[0]; x2 = xb[1];
1741       ib = idx + ii[0];
1742       n  = ii[1] - ii[0]; ii++;
1743       for (j=0; j<n; j++) {
1744         rval       = ib[j]*2;
1745         z[rval++] += v[0]*x1 + v[1]*x2;
1746         z[rval++] += v[2]*x1 + v[3]*x2;
1747         v         += 4;
1748       }
1749       if (!usecprow) xb += 2;
1750     }
1751     break;
1752   case 3:
1753     for (i=0; i<mbs; i++) {
1754       if (usecprow) xb = x + 3*ridx[i];
1755       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1756       ib = idx + ii[0];
1757       n  = ii[1] - ii[0]; ii++;
1758       for (j=0; j<n; j++) {
1759         rval       = ib[j]*3;
1760         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1761         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1762         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1763         v         += 9;
1764       }
1765       if (!usecprow) xb += 3;
1766     }
1767     break;
1768   case 4:
1769     for (i=0; i<mbs; i++) {
1770       if (usecprow) xb = x + 4*ridx[i];
1771       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1772       ib = idx + ii[0];
1773       n  = ii[1] - ii[0]; ii++;
1774       for (j=0; j<n; j++) {
1775         rval       = ib[j]*4;
1776         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1777         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1778         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1779         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1780         v         += 16;
1781       }
1782       if (!usecprow) xb += 4;
1783     }
1784     break;
1785   case 5:
1786     for (i=0; i<mbs; i++) {
1787       if (usecprow) xb = x + 5*ridx[i];
1788       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1789       x4 = xb[3]; x5 = xb[4];
1790       ib = idx + ii[0];
1791       n  = ii[1] - ii[0]; ii++;
1792       for (j=0; j<n; j++) {
1793         rval       = ib[j]*5;
1794         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1795         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1796         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1797         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1798         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1799         v         += 25;
1800       }
1801       if (!usecprow) xb += 5;
1802     }
1803     break;
1804   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1805     PetscInt          ncols,k;
1806     PetscScalar       *work,*workt;
1807     const PetscScalar *xtmp;
1808     if (!a->mult_work) {
1809       k    = PetscMax(A->rmap->n,A->cmap->n);
1810       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1811     }
1812     work = a->mult_work;
1813     xtmp = x;
1814     for (i=0; i<mbs; i++) {
1815       n     = ii[1] - ii[0]; ii++;
1816       ncols = n*bs;
1817       ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1818       if (usecprow) xtmp = x + bs*ridx[i];
1819       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1820       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1821       v += n*bs2;
1822       if (!usecprow) xtmp += bs;
1823       workt = work;
1824       for (j=0; j<n; j++) {
1825         zb = z + bs*(*idx++);
1826         for (k=0; k<bs; k++) zb[k] += workt[k];
1827         workt += bs;
1828       }
1829     }
1830     }
1831   }
1832   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1833   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1834   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
1835   PetscFunctionReturn(0);
1836 }
1837 
1838 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1839 {
1840   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
1841   PetscInt       totalnz = a->bs2*a->nz;
1842   PetscScalar    oalpha  = alpha;
1843   PetscErrorCode ierr;
1844   PetscBLASInt   one = 1,tnz;
1845 
1846   PetscFunctionBegin;
1847   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
1848   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1849   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1850   PetscFunctionReturn(0);
1851 }
1852 
1853 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1854 {
1855   PetscErrorCode ierr;
1856   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
1857   MatScalar      *v  = a->a;
1858   PetscReal      sum = 0.0;
1859   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1860 
1861   PetscFunctionBegin;
1862   if (type == NORM_FROBENIUS) {
1863 #if defined(PETSC_USE_REAL___FP16)
1864     PetscBLASInt one = 1,cnt = bs2*nz;
1865     *norm = BLASnrm2_(&cnt,v,&one);
1866 #else
1867     for (i=0; i<bs2*nz; i++) {
1868       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1869     }
1870 #endif
1871     *norm = PetscSqrtReal(sum);
1872     ierr = PetscLogFlops(2*bs2*nz);CHKERRQ(ierr);
1873   } else if (type == NORM_1) { /* maximum column sum */
1874     PetscReal *tmp;
1875     PetscInt  *bcol = a->j;
1876     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
1877     for (i=0; i<nz; i++) {
1878       for (j=0; j<bs; j++) {
1879         k1 = bs*(*bcol) + j; /* column index */
1880         for (k=0; k<bs; k++) {
1881           tmp[k1] += PetscAbsScalar(*v); v++;
1882         }
1883       }
1884       bcol++;
1885     }
1886     *norm = 0.0;
1887     for (j=0; j<A->cmap->n; j++) {
1888       if (tmp[j] > *norm) *norm = tmp[j];
1889     }
1890     ierr = PetscFree(tmp);CHKERRQ(ierr);
1891     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1892   } else if (type == NORM_INFINITY) { /* maximum row sum */
1893     *norm = 0.0;
1894     for (k=0; k<bs; k++) {
1895       for (j=0; j<a->mbs; j++) {
1896         v   = a->a + bs2*a->i[j] + k;
1897         sum = 0.0;
1898         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1899           for (k1=0; k1<bs; k1++) {
1900             sum += PetscAbsScalar(*v);
1901             v   += bs;
1902           }
1903         }
1904         if (sum > *norm) *norm = sum;
1905       }
1906     }
1907     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
1908   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1909   PetscFunctionReturn(0);
1910 }
1911 
1912 
1913 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1914 {
1915   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1916   PetscErrorCode ierr;
1917 
1918   PetscFunctionBegin;
1919   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1920   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1921     *flg = PETSC_FALSE;
1922     PetscFunctionReturn(0);
1923   }
1924 
1925   /* if the a->i are the same */
1926   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1927   if (!*flg) PetscFunctionReturn(0);
1928 
1929   /* if a->j are the same */
1930   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1931   if (!*flg) PetscFunctionReturn(0);
1932 
1933   /* if a->a are the same */
1934   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1935   PetscFunctionReturn(0);
1936 
1937 }
1938 
1939 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1940 {
1941   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1942   PetscErrorCode ierr;
1943   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1944   PetscScalar    *x,zero = 0.0;
1945   MatScalar      *aa,*aa_j;
1946 
1947   PetscFunctionBegin;
1948   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1949   bs   = A->rmap->bs;
1950   aa   = a->a;
1951   ai   = a->i;
1952   aj   = a->j;
1953   ambs = a->mbs;
1954   bs2  = a->bs2;
1955 
1956   ierr = VecSet(v,zero);CHKERRQ(ierr);
1957   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1958   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1959   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1960   for (i=0; i<ambs; i++) {
1961     for (j=ai[i]; j<ai[i+1]; j++) {
1962       if (aj[j] == i) {
1963         row  = i*bs;
1964         aa_j = aa+j*bs2;
1965         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1966         break;
1967       }
1968     }
1969   }
1970   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1971   PetscFunctionReturn(0);
1972 }
1973 
1974 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1975 {
1976   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1977   const PetscScalar *l,*r,*li,*ri;
1978   PetscScalar       x;
1979   MatScalar         *aa, *v;
1980   PetscErrorCode    ierr;
1981   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
1982   const PetscInt    *ai,*aj;
1983 
1984   PetscFunctionBegin;
1985   ai  = a->i;
1986   aj  = a->j;
1987   aa  = a->a;
1988   m   = A->rmap->n;
1989   n   = A->cmap->n;
1990   bs  = A->rmap->bs;
1991   mbs = a->mbs;
1992   bs2 = a->bs2;
1993   if (ll) {
1994     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1995     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1996     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1997     for (i=0; i<mbs; i++) { /* for each block row */
1998       M  = ai[i+1] - ai[i];
1999       li = l + i*bs;
2000       v  = aa + bs2*ai[i];
2001       for (j=0; j<M; j++) { /* for each block */
2002         for (k=0; k<bs2; k++) {
2003           (*v++) *= li[k%bs];
2004         }
2005       }
2006     }
2007     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2008     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2009   }
2010 
2011   if (rr) {
2012     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2013     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2014     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2015     for (i=0; i<mbs; i++) { /* for each block row */
2016       iai = ai[i];
2017       M   = ai[i+1] - iai;
2018       v   = aa + bs2*iai;
2019       for (j=0; j<M; j++) { /* for each block */
2020         ri = r + bs*aj[iai+j];
2021         for (k=0; k<bs; k++) {
2022           x = ri[k];
2023           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2024           v += bs;
2025         }
2026       }
2027     }
2028     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2029     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2030   }
2031   PetscFunctionReturn(0);
2032 }
2033 
2034 
2035 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2036 {
2037   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2038 
2039   PetscFunctionBegin;
2040   info->block_size   = a->bs2;
2041   info->nz_allocated = a->bs2*a->maxnz;
2042   info->nz_used      = a->bs2*a->nz;
2043   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2044   info->assemblies   = A->num_ass;
2045   info->mallocs      = A->info.mallocs;
2046   info->memory       = ((PetscObject)A)->mem;
2047   if (A->factortype) {
2048     info->fill_ratio_given  = A->info.fill_ratio_given;
2049     info->fill_ratio_needed = A->info.fill_ratio_needed;
2050     info->factor_mallocs    = A->info.factor_mallocs;
2051   } else {
2052     info->fill_ratio_given  = 0;
2053     info->fill_ratio_needed = 0;
2054     info->factor_mallocs    = 0;
2055   }
2056   PetscFunctionReturn(0);
2057 }
2058 
2059 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2060 {
2061   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2062   PetscErrorCode ierr;
2063 
2064   PetscFunctionBegin;
2065   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
2066   PetscFunctionReturn(0);
2067 }
2068