xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 4e8afd12df985612b40e26aef947da2f566aee4e)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
14   const PetscInt *idx;
15   PetscBT        table_out,table_in;
16 
17   PetscFunctionBegin;
18   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19   mbs  = a->mbs;
20   ai   = a->i;
21   aj   = a->j;
22   bs   = A->rmap->bs;
23   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
24   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
25   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
26   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) { /* for each is */
29     isz  = 0;
30     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
31 
32     /* Extract the indices, assume there can be duplicate entries */
33     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35 
36     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37     bcol_max = 0;
38     for (j=0; j<n; ++j) {
39       brow = idx[j]/bs; /* convert the indices into block indices */
40       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if (!PetscBTLookupSet(table_out,brow)) {
42         nidx[isz++] = brow;
43         if (bcol_max < brow) bcol_max = brow;
44       }
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++) { /* for each overlap */
51       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
53       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
54 
55       n = isz;  /* length of the updated is[i] */
56       for (brow=0; brow<mbs; brow++) {
57         start = ai[brow]; end   = ai[brow+1];
58         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59           for (l = start; l<end; l++) {
60             bcol = aj[l];
61             if (!PetscBTLookupSet(table_out,bcol)) {
62               nidx[isz++] = bcol;
63               if (bcol_max < bcol) bcol_max = bcol;
64             }
65           }
66           k++;
67           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69           for (l = start; l<end; l++) {
70             bcol = aj[l];
71             if (bcol > bcol_max) break;
72             if (PetscBTLookup(table_in,bcol)) {
73               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74               break; /* for l = start; l<end ; l++) */
75             }
76           }
77         }
78       }
79     } /* for each overlap */
80 
81     /* expand the Index Set */
82     for (j=0; j<isz; j++) {
83       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95         Zero some ops' to avoid invalid usse */
96 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97 {
98   PetscErrorCode ierr;
99 
100   PetscFunctionBegin;
101   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
102   Bseq->ops->mult                   = 0;
103   Bseq->ops->multadd                = 0;
104   Bseq->ops->multtranspose          = 0;
105   Bseq->ops->multtransposeadd       = 0;
106   Bseq->ops->lufactor               = 0;
107   Bseq->ops->choleskyfactor         = 0;
108   Bseq->ops->lufactorsymbolic       = 0;
109   Bseq->ops->choleskyfactorsymbolic = 0;
110   Bseq->ops->getinertia             = 0;
111   PetscFunctionReturn(0);
112 }
113 
114 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
115 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116 {
117   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
118   PetscErrorCode ierr;
119   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
121   const PetscInt *irow,*icol;
122   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123   PetscInt       *aj = a->j,*ai = a->i;
124   MatScalar      *mat_a;
125   Mat            C;
126   PetscBool      flag;
127 
128   PetscFunctionBegin;
129 
130   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
132   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
133   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
134 
135   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
136   ssmap = smap;
137   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
138   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139   /* determine lens of each row */
140   for (i=0; i<nrows; i++) {
141     kstart  = ai[irow[i]];
142     kend    = kstart + a->ilen[irow[i]];
143     lens[i] = 0;
144     for (k=kstart; k<kend; k++) {
145       if (ssmap[aj[k]]) lens[i]++;
146     }
147   }
148   /* Create and fill new matrix */
149   if (scall == MAT_REUSE_MATRIX) {
150     c = (Mat_SeqSBAIJ*)((*B)->data);
151 
152     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
154     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
156     C    = *B;
157   } else {
158     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
159     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
160     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
161     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
162   }
163   c = (Mat_SeqSBAIJ*)(C->data);
164   for (i=0; i<nrows; i++) {
165     row      = irow[i];
166     kstart   = ai[row];
167     kend     = kstart + a->ilen[row];
168     mat_i    = c->i[i];
169     mat_j    = c->j + mat_i;
170     mat_a    = c->a + mat_i*bs2;
171     mat_ilen = c->ilen + i;
172     for (k=kstart; k<kend; k++) {
173       if ((tcol=ssmap[a->j[k]])) {
174         *mat_j++ = tcol - 1;
175         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
176         mat_a   += bs2;
177         (*mat_ilen)++;
178       }
179     }
180   }
181   /* sort */
182   {
183     MatScalar *work;
184 
185     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
186     for (i=0; i<nrows; i++) {
187       PetscInt ilen;
188       mat_i = c->i[i];
189       mat_j = c->j + mat_i;
190       mat_a = c->a + mat_i*bs2;
191       ilen  = c->ilen[i];
192       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
193     }
194     ierr = PetscFree(work);CHKERRQ(ierr);
195   }
196 
197   /* Free work space */
198   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199   ierr = PetscFree(smap);CHKERRQ(ierr);
200   ierr = PetscFree(lens);CHKERRQ(ierr);
201   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203 
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   *B   = C;
206   PetscFunctionReturn(0);
207 }
208 
209 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
210 {
211   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
212   IS             is1,is2;
213   PetscErrorCode ierr;
214   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215   const PetscInt *irow,*icol;
216 
217   PetscFunctionBegin;
218   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
219   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
220   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
221   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
222 
223   /* Verify if the indices corespond to each element in a block
224    and form the IS with compressed IS */
225   maxmnbs = PetscMax(a->mbs,a->nbs);
226   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
227   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
228   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229   for (i=0; i<a->mbs; i++) {
230     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231   }
232   count = 0;
233   for (i=0; i<nrows; i++) {
234     PetscInt j = irow[i] / bs;
235     if ((vary[j]--)==bs) iary[count++] = j;
236   }
237   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
238 
239   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
240   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241   for (i=0; i<a->nbs; i++) {
242     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243   }
244   count = 0;
245   for (i=0; i<ncols; i++) {
246     PetscInt j = icol[i] / bs;
247     if ((vary[j]--)==bs) iary[count++] = j;
248   }
249   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
250   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
251   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
252   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
253 
254   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
255   ierr = ISDestroy(&is1);CHKERRQ(ierr);
256   ierr = ISDestroy(&is2);CHKERRQ(ierr);
257 
258   if (isrow != iscol) {
259     PetscBool isequal;
260     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
261     if (!isequal) {
262       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
263     }
264   }
265   PetscFunctionReturn(0);
266 }
267 
268 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
269 {
270   PetscErrorCode ierr;
271   PetscInt       i;
272 
273   PetscFunctionBegin;
274   if (scall == MAT_INITIAL_MATRIX) {
275     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
276   }
277 
278   for (i=0; i<n; i++) {
279     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
280   }
281   PetscFunctionReturn(0);
282 }
283 
284 /* -------------------------------------------------------*/
285 /* Should check that shapes of vectors and matrices match */
286 /* -------------------------------------------------------*/
287 
288 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
289 {
290   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
291   PetscScalar       *z,x1,x2,zero=0.0;
292   const PetscScalar *x,*xb;
293   const MatScalar   *v;
294   PetscErrorCode    ierr;
295   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
296   const PetscInt    *aj=a->j,*ai=a->i,*ib;
297   PetscInt          nonzerorow=0;
298 
299   PetscFunctionBegin;
300   ierr = VecSet(zz,zero);CHKERRQ(ierr);
301   if (!a->nz) PetscFunctionReturn(0);
302   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304 
305   v  = a->a;
306   xb = x;
307 
308   for (i=0; i<mbs; i++) {
309     n           = ai[1] - ai[0]; /* length of i_th block row of A */
310     x1          = xb[0]; x2 = xb[1];
311     ib          = aj + *ai;
312     jmin        = 0;
313     nonzerorow += (n>0);
314     if (*ib == i) {     /* (diag of A)*x */
315       z[2*i]   += v[0]*x1 + v[2]*x2;
316       z[2*i+1] += v[2]*x1 + v[3]*x2;
317       v        += 4; jmin++;
318     }
319     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321     for (j=jmin; j<n; j++) {
322       /* (strict lower triangular part of A)*x  */
323       cval       = ib[j]*2;
324       z[cval]   += v[0]*x1 + v[1]*x2;
325       z[cval+1] += v[2]*x1 + v[3]*x2;
326       /* (strict upper triangular part of A)*x  */
327       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
328       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
329       v        += 4;
330     }
331     xb +=2; ai++;
332   }
333 
334   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
335   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
337   PetscFunctionReturn(0);
338 }
339 
340 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
341 {
342   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
343   PetscScalar       *z,x1,x2,x3,zero=0.0;
344   const PetscScalar *x,*xb;
345   const MatScalar   *v;
346   PetscErrorCode    ierr;
347   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
348   const PetscInt    *aj = a->j,*ai = a->i,*ib;
349   PetscInt          nonzerorow=0;
350 
351   PetscFunctionBegin;
352   ierr = VecSet(zz,zero);CHKERRQ(ierr);
353   if (!a->nz) PetscFunctionReturn(0);
354   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
355   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
356 
357   v  = a->a;
358   xb = x;
359 
360   for (i=0; i<mbs; i++) {
361     n           = ai[1] - ai[0]; /* length of i_th block row of A */
362     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
363     ib          = aj + *ai;
364     jmin        = 0;
365     nonzerorow += (n>0);
366     if (*ib == i) {     /* (diag of A)*x */
367       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
368       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
369       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370       v        += 9; jmin++;
371     }
372     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
374     for (j=jmin; j<n; j++) {
375       /* (strict lower triangular part of A)*x  */
376       cval       = ib[j]*3;
377       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
378       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
379       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
380       /* (strict upper triangular part of A)*x  */
381       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
382       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
383       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
384       v        += 9;
385     }
386     xb +=3; ai++;
387   }
388 
389   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
391   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
392   PetscFunctionReturn(0);
393 }
394 
395 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
396 {
397   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
398   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
399   const PetscScalar *x,*xb;
400   const MatScalar   *v;
401   PetscErrorCode    ierr;
402   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
403   const PetscInt    *aj = a->j,*ai = a->i,*ib;
404   PetscInt          nonzerorow = 0;
405 
406   PetscFunctionBegin;
407   ierr = VecSet(zz,zero);CHKERRQ(ierr);
408   if (!a->nz) PetscFunctionReturn(0);
409   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411 
412   v  = a->a;
413   xb = x;
414 
415   for (i=0; i<mbs; i++) {
416     n           = ai[1] - ai[0]; /* length of i_th block row of A */
417     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
418     ib          = aj + *ai;
419     jmin        = 0;
420     nonzerorow += (n>0);
421     if (*ib == i) {     /* (diag of A)*x */
422       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
423       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
424       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
425       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426       v        += 16; jmin++;
427     }
428     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430     for (j=jmin; j<n; j++) {
431       /* (strict lower triangular part of A)*x  */
432       cval       = ib[j]*4;
433       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
434       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
435       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
436       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
437       /* (strict upper triangular part of A)*x  */
438       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
439       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
440       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
441       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
442       v        += 16;
443     }
444     xb +=4; ai++;
445   }
446 
447   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
448   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
450   PetscFunctionReturn(0);
451 }
452 
453 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
454 {
455   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
456   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
457   const PetscScalar *x,*xb;
458   const MatScalar   *v;
459   PetscErrorCode    ierr;
460   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
461   const PetscInt    *aj = a->j,*ai = a->i,*ib;
462   PetscInt          nonzerorow=0;
463 
464   PetscFunctionBegin;
465   ierr = VecSet(zz,zero);CHKERRQ(ierr);
466   if (!a->nz) PetscFunctionReturn(0);
467   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
468   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
469 
470   v  = a->a;
471   xb = x;
472 
473   for (i=0; i<mbs; i++) {
474     n           = ai[1] - ai[0]; /* length of i_th block row of A */
475     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
476     ib          = aj + *ai;
477     jmin        = 0;
478     nonzerorow += (n>0);
479     if (*ib == i) {      /* (diag of A)*x */
480       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
481       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
482       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
483       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
484       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485       v        += 25; jmin++;
486     }
487     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489     for (j=jmin; j<n; j++) {
490       /* (strict lower triangular part of A)*x  */
491       cval       = ib[j]*5;
492       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
493       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
494       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
495       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
496       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
497       /* (strict upper triangular part of A)*x  */
498       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
499       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
500       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
501       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
502       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
503       v        += 25;
504     }
505     xb +=5; ai++;
506   }
507 
508   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
509   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
511   PetscFunctionReturn(0);
512 }
513 
514 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515 {
516   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518   const PetscScalar *x,*xb;
519   const MatScalar   *v;
520   PetscErrorCode    ierr;
521   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522   const PetscInt    *aj=a->j,*ai=a->i,*ib;
523   PetscInt          nonzerorow=0;
524 
525   PetscFunctionBegin;
526   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527   if (!a->nz) PetscFunctionReturn(0);
528   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
529   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
530 
531   v  = a->a;
532   xb = x;
533 
534   for (i=0; i<mbs; i++) {
535     n           = ai[1] - ai[0]; /* length of i_th block row of A */
536     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537     ib          = aj + *ai;
538     jmin        = 0;
539     nonzerorow += (n>0);
540     if (*ib == i) {      /* (diag of A)*x */
541       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547       v        += 36; jmin++;
548     }
549     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551     for (j=jmin; j<n; j++) {
552       /* (strict lower triangular part of A)*x  */
553       cval       = ib[j]*6;
554       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560       /* (strict upper triangular part of A)*x  */
561       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567       v        += 36;
568     }
569     xb +=6; ai++;
570   }
571 
572   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
575   PetscFunctionReturn(0);
576 }
577 
578 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
579 {
580   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
581   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582   const PetscScalar *x,*xb;
583   const MatScalar   *v;
584   PetscErrorCode    ierr;
585   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
586   const PetscInt    *aj=a->j,*ai=a->i,*ib;
587   PetscInt          nonzerorow=0;
588 
589   PetscFunctionBegin;
590   ierr = VecSet(zz,zero);CHKERRQ(ierr);
591   if (!a->nz) PetscFunctionReturn(0);
592   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
593   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
594 
595   v  = a->a;
596   xb = x;
597 
598   for (i=0; i<mbs; i++) {
599     n           = ai[1] - ai[0]; /* length of i_th block row of A */
600     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
601     ib          = aj + *ai;
602     jmin        = 0;
603     nonzerorow += (n>0);
604     if (*ib == i) {      /* (diag of A)*x */
605       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
606       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
607       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
608       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
609       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
610       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
611       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612       v        += 49; jmin++;
613     }
614     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616     for (j=jmin; j<n; j++) {
617       /* (strict lower triangular part of A)*x  */
618       cval       = ib[j]*7;
619       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
620       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
621       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
622       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
623       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
624       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
625       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
626       /* (strict upper triangular part of A)*x  */
627       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
628       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
629       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
630       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
631       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
632       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
633       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
634       v       += 49;
635     }
636     xb +=7; ai++;
637   }
638   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
639   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
640   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
641   PetscFunctionReturn(0);
642 }
643 
644 /*
645     This will not work with MatScalar == float because it calls the BLAS
646 */
647 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
648 {
649   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
650   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651   const PetscScalar *x,*x_ptr,*xb;
652   const MatScalar   *v;
653   PetscErrorCode    ierr;
654   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655   const PetscInt    *idx,*aj,*ii;
656   PetscInt          nonzerorow=0;
657 
658   PetscFunctionBegin;
659   ierr = VecSet(zz,zero);CHKERRQ(ierr);
660   if (!a->nz) PetscFunctionReturn(0);
661   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
662   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
663 
664   aj = a->j;
665   v  = a->a;
666   ii = a->i;
667 
668   if (!a->mult_work) {
669     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
670   }
671   work = a->mult_work;
672 
673   for (i=0; i<mbs; i++) {
674     n           = ii[1] - ii[0]; ncols = n*bs;
675     workt       = work; idx=aj+ii[0];
676     nonzerorow += (n>0);
677 
678     /* upper triangular part */
679     for (j=0; j<n; j++) {
680       xb = x_ptr + bs*(*idx++);
681       for (k=0; k<bs; k++) workt[k] = xb[k];
682       workt += bs;
683     }
684     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
685     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
686 
687     /* strict lower triangular part */
688     idx = aj+ii[0];
689     if (*idx == i) {
690       ncols -= bs; v += bs2; idx++; n--;
691     }
692 
693     if (ncols > 0) {
694       workt = work;
695       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
696       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
697       for (j=0; j<n; j++) {
698         zb = z_ptr + bs*(*idx++);
699         for (k=0; k<bs; k++) zb[k] += workt[k];
700         workt += bs;
701       }
702     }
703     x += bs; v += n*bs2; z += bs; ii++;
704   }
705 
706   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
707   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
708   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
713 {
714   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
715   PetscScalar       *z,x1;
716   const PetscScalar *x,*xb;
717   const MatScalar   *v;
718   PetscErrorCode    ierr;
719   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
720   const PetscInt    *aj=a->j,*ai=a->i,*ib;
721   PetscInt          nonzerorow=0;
722 #if defined(PETSC_USE_COMPLEX)
723   const int         aconj = A->hermitian;
724 #else
725   const int         aconj = 0;
726 #endif
727 
728   PetscFunctionBegin;
729   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
730   if (!a->nz) PetscFunctionReturn(0);
731   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
732   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
733   v    = a->a;
734   xb   = x;
735 
736   for (i=0; i<mbs; i++) {
737     n           = ai[1] - ai[0]; /* length of i_th row of A */
738     x1          = xb[0];
739     ib          = aj + *ai;
740     jmin        = 0;
741     nonzerorow += (n>0);
742     if (*ib == i) {            /* (diag of A)*x */
743       z[i] += *v++ * x[*ib++]; jmin++;
744     }
745     if (aconj) {
746       for (j=jmin; j<n; j++) {
747         cval    = *ib;
748         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
749         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
750       }
751     } else {
752       for (j=jmin; j<n; j++) {
753         cval    = *ib;
754         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
755         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
756       }
757     }
758     xb++; ai++;
759   }
760 
761   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
763 
764   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
765   PetscFunctionReturn(0);
766 }
767 
768 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
769 {
770   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
771   PetscScalar       *z,x1,x2;
772   const PetscScalar *x,*xb;
773   const MatScalar   *v;
774   PetscErrorCode    ierr;
775   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
776   const PetscInt    *aj=a->j,*ai=a->i,*ib;
777   PetscInt          nonzerorow=0;
778 
779   PetscFunctionBegin;
780   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
781   if (!a->nz) PetscFunctionReturn(0);
782   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
783   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
784 
785   v  = a->a;
786   xb = x;
787 
788   for (i=0; i<mbs; i++) {
789     n           = ai[1] - ai[0]; /* length of i_th block row of A */
790     x1          = xb[0]; x2 = xb[1];
791     ib          = aj + *ai;
792     jmin        = 0;
793     nonzerorow += (n>0);
794     if (*ib == i) {      /* (diag of A)*x */
795       z[2*i]   += v[0]*x1 + v[2]*x2;
796       z[2*i+1] += v[2]*x1 + v[3]*x2;
797       v        += 4; jmin++;
798     }
799     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
800     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
801     for (j=jmin; j<n; j++) {
802       /* (strict lower triangular part of A)*x  */
803       cval       = ib[j]*2;
804       z[cval]   += v[0]*x1 + v[1]*x2;
805       z[cval+1] += v[2]*x1 + v[3]*x2;
806       /* (strict upper triangular part of A)*x  */
807       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
808       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
809       v        += 4;
810     }
811     xb +=2; ai++;
812   }
813   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
814   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
815 
816   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
817   PetscFunctionReturn(0);
818 }
819 
820 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
821 {
822   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
823   PetscScalar       *z,x1,x2,x3;
824   const PetscScalar *x,*xb;
825   const MatScalar   *v;
826   PetscErrorCode    ierr;
827   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
828   const PetscInt    *aj=a->j,*ai=a->i,*ib;
829   PetscInt          nonzerorow=0;
830 
831   PetscFunctionBegin;
832   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
833   if (!a->nz) PetscFunctionReturn(0);
834   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
835   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
836 
837   v  = a->a;
838   xb = x;
839 
840   for (i=0; i<mbs; i++) {
841     n           = ai[1] - ai[0]; /* length of i_th block row of A */
842     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
843     ib          = aj + *ai;
844     jmin        = 0;
845     nonzerorow += (n>0);
846     if (*ib == i) {     /* (diag of A)*x */
847       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
848       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
849       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
850       v        += 9; jmin++;
851     }
852     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
853     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
854     for (j=jmin; j<n; j++) {
855       /* (strict lower triangular part of A)*x  */
856       cval       = ib[j]*3;
857       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
858       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
859       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
860       /* (strict upper triangular part of A)*x  */
861       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
862       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
863       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
864       v        += 9;
865     }
866     xb +=3; ai++;
867   }
868 
869   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
870   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
871 
872   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
873   PetscFunctionReturn(0);
874 }
875 
876 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
877 {
878   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
879   PetscScalar       *z,x1,x2,x3,x4;
880   const PetscScalar *x,*xb;
881   const MatScalar   *v;
882   PetscErrorCode    ierr;
883   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
884   const PetscInt    *aj=a->j,*ai=a->i,*ib;
885   PetscInt          nonzerorow=0;
886 
887   PetscFunctionBegin;
888   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
889   if (!a->nz) PetscFunctionReturn(0);
890   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
891   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
892 
893   v  = a->a;
894   xb = x;
895 
896   for (i=0; i<mbs; i++) {
897     n           = ai[1] - ai[0]; /* length of i_th block row of A */
898     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
899     ib          = aj + *ai;
900     jmin        = 0;
901     nonzerorow += (n>0);
902     if (*ib == i) {      /* (diag of A)*x */
903       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
904       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
905       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
906       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
907       v        += 16; jmin++;
908     }
909     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
910     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
911     for (j=jmin; j<n; j++) {
912       /* (strict lower triangular part of A)*x  */
913       cval       = ib[j]*4;
914       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
915       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
916       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
917       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
918       /* (strict upper triangular part of A)*x  */
919       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
920       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
921       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
922       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
923       v        += 16;
924     }
925     xb +=4; ai++;
926   }
927 
928   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
929   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
930 
931   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
932   PetscFunctionReturn(0);
933 }
934 
935 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
936 {
937   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
938   PetscScalar       *z,x1,x2,x3,x4,x5;
939   const PetscScalar *x,*xb;
940   const MatScalar   *v;
941   PetscErrorCode    ierr;
942   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
943   const PetscInt    *aj=a->j,*ai=a->i,*ib;
944   PetscInt          nonzerorow=0;
945 
946   PetscFunctionBegin;
947   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
948   if (!a->nz) PetscFunctionReturn(0);
949   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
950   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
951 
952   v  = a->a;
953   xb = x;
954 
955   for (i=0; i<mbs; i++) {
956     n           = ai[1] - ai[0]; /* length of i_th block row of A */
957     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
958     ib          = aj + *ai;
959     jmin        = 0;
960     nonzerorow += (n>0);
961     if (*ib == i) {      /* (diag of A)*x */
962       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
963       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
964       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
965       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
966       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
967       v        += 25; jmin++;
968     }
969     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
970     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
971     for (j=jmin; j<n; j++) {
972       /* (strict lower triangular part of A)*x  */
973       cval       = ib[j]*5;
974       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
975       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
976       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
977       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
978       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
979       /* (strict upper triangular part of A)*x  */
980       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
981       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
982       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
983       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
984       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
985       v        += 25;
986     }
987     xb +=5; ai++;
988   }
989 
990   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
991   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
992 
993   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
994   PetscFunctionReturn(0);
995 }
996 
997 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
998 {
999   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1000   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1001   const PetscScalar *x,*xb;
1002   const MatScalar   *v;
1003   PetscErrorCode    ierr;
1004   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1005   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1006   PetscInt          nonzerorow=0;
1007 
1008   PetscFunctionBegin;
1009   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1010   if (!a->nz) PetscFunctionReturn(0);
1011   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1012   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1013 
1014   v  = a->a;
1015   xb = x;
1016 
1017   for (i=0; i<mbs; i++) {
1018     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1019     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1020     ib          = aj + *ai;
1021     jmin        = 0;
1022     nonzerorow += (n>0);
1023     if (*ib == i) {     /* (diag of A)*x */
1024       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1025       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1026       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1027       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1028       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1029       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1030       v        += 36; jmin++;
1031     }
1032     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1033     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1034     for (j=jmin; j<n; j++) {
1035       /* (strict lower triangular part of A)*x  */
1036       cval       = ib[j]*6;
1037       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1038       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1039       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1040       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1041       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1042       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1043       /* (strict upper triangular part of A)*x  */
1044       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1045       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1046       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1047       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1048       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1049       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1050       v        += 36;
1051     }
1052     xb +=6; ai++;
1053   }
1054 
1055   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1056   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1057 
1058   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1059   PetscFunctionReturn(0);
1060 }
1061 
1062 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1063 {
1064   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1065   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1066   const PetscScalar *x,*xb;
1067   const MatScalar   *v;
1068   PetscErrorCode    ierr;
1069   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1070   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1071   PetscInt          nonzerorow=0;
1072 
1073   PetscFunctionBegin;
1074   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1075   if (!a->nz) PetscFunctionReturn(0);
1076   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1077   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1078 
1079   v  = a->a;
1080   xb = x;
1081 
1082   for (i=0; i<mbs; i++) {
1083     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1084     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1085     ib          = aj + *ai;
1086     jmin        = 0;
1087     nonzerorow += (n>0);
1088     if (*ib == i) {     /* (diag of A)*x */
1089       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1090       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1091       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1092       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1093       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1094       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1095       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1096       v        += 49; jmin++;
1097     }
1098     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1099     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1100     for (j=jmin; j<n; j++) {
1101       /* (strict lower triangular part of A)*x  */
1102       cval       = ib[j]*7;
1103       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1104       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1105       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1106       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1107       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1108       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1109       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1110       /* (strict upper triangular part of A)*x  */
1111       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1112       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1113       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1114       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1115       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1116       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1117       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1118       v       += 49;
1119     }
1120     xb +=7; ai++;
1121   }
1122 
1123   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1124   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1125 
1126   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1127   PetscFunctionReturn(0);
1128 }
1129 
1130 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1131 {
1132   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1133   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1134   const PetscScalar *x,*x_ptr,*xb;
1135   const MatScalar   *v;
1136   PetscErrorCode    ierr;
1137   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1138   const PetscInt    *idx,*aj,*ii;
1139   PetscInt          nonzerorow=0;
1140 
1141   PetscFunctionBegin;
1142   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1143   if (!a->nz) PetscFunctionReturn(0);
1144   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1145   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1146 
1147   aj = a->j;
1148   v  = a->a;
1149   ii = a->i;
1150 
1151   if (!a->mult_work) {
1152     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1153   }
1154   work = a->mult_work;
1155 
1156 
1157   for (i=0; i<mbs; i++) {
1158     n           = ii[1] - ii[0]; ncols = n*bs;
1159     workt       = work; idx=aj+ii[0];
1160     nonzerorow += (n>0);
1161 
1162     /* upper triangular part */
1163     for (j=0; j<n; j++) {
1164       xb = x_ptr + bs*(*idx++);
1165       for (k=0; k<bs; k++) workt[k] = xb[k];
1166       workt += bs;
1167     }
1168     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1169     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1170 
1171     /* strict lower triangular part */
1172     idx = aj+ii[0];
1173     if (*idx == i) {
1174       ncols -= bs; v += bs2; idx++; n--;
1175     }
1176     if (ncols > 0) {
1177       workt = work;
1178       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
1179       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1180       for (j=0; j<n; j++) {
1181         zb = z_ptr + bs*(*idx++);
1182         for (k=0; k<bs; k++) zb[k] += workt[k];
1183         workt += bs;
1184       }
1185     }
1186 
1187     x += bs; v += n*bs2; z += bs; ii++;
1188   }
1189 
1190   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1191   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1192 
1193   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1194   PetscFunctionReturn(0);
1195 }
1196 
1197 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1198 {
1199   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1200   PetscScalar    oalpha = alpha;
1201   PetscErrorCode ierr;
1202   PetscBLASInt   one = 1,totalnz;
1203 
1204   PetscFunctionBegin;
1205   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1206   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1207   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1208   PetscFunctionReturn(0);
1209 }
1210 
1211 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1212 {
1213   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1214   const MatScalar *v       = a->a;
1215   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1216   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1217   PetscErrorCode  ierr;
1218   const PetscInt  *aj=a->j,*col;
1219 
1220   PetscFunctionBegin;
1221   if (!a->nz) {
1222     *norm = 0.0;
1223     PetscFunctionReturn(0);
1224   }
1225   if (type == NORM_FROBENIUS) {
1226     for (k=0; k<mbs; k++) {
1227       jmin = a->i[k]; jmax = a->i[k+1];
1228       col  = aj + jmin;
1229       if (*col == k) {         /* diagonal block */
1230         for (i=0; i<bs2; i++) {
1231           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1232         }
1233         jmin++;
1234       }
1235       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1236         for (i=0; i<bs2; i++) {
1237           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1238         }
1239       }
1240     }
1241     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1242     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
1243   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1244     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1245     for (i=0; i<mbs; i++) jl[i] = mbs;
1246     il[0] = 0;
1247 
1248     *norm = 0.0;
1249     for (k=0; k<mbs; k++) { /* k_th block row */
1250       for (j=0; j<bs; j++) sum[j]=0.0;
1251       /*-- col sum --*/
1252       i = jl[k]; /* first |A(i,k)| to be added */
1253       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1254                   at step k */
1255       while (i<mbs) {
1256         nexti = jl[i];  /* next block row to be added */
1257         ik    = il[i];  /* block index of A(i,k) in the array a */
1258         for (j=0; j<bs; j++) {
1259           v = a->a + ik*bs2 + j*bs;
1260           for (k1=0; k1<bs; k1++) {
1261             sum[j] += PetscAbsScalar(*v); v++;
1262           }
1263         }
1264         /* update il, jl */
1265         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1266         jmax = a->i[i+1];
1267         if (jmin < jmax) {
1268           il[i] = jmin;
1269           j     = a->j[jmin];
1270           jl[i] = jl[j]; jl[j]=i;
1271         }
1272         i = nexti;
1273       }
1274       /*-- row sum --*/
1275       jmin = a->i[k]; jmax = a->i[k+1];
1276       for (i=jmin; i<jmax; i++) {
1277         for (j=0; j<bs; j++) {
1278           v = a->a + i*bs2 + j;
1279           for (k1=0; k1<bs; k1++) {
1280             sum[j] += PetscAbsScalar(*v); v += bs;
1281           }
1282         }
1283       }
1284       /* add k_th block row to il, jl */
1285       col = aj+jmin;
1286       if (*col == k) jmin++;
1287       if (jmin < jmax) {
1288         il[k] = jmin;
1289         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1290       }
1291       for (j=0; j<bs; j++) {
1292         if (sum[j] > *norm) *norm = sum[j];
1293       }
1294     }
1295     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1296     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1297   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1298   PetscFunctionReturn(0);
1299 }
1300 
1301 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1302 {
1303   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1304   PetscErrorCode ierr;
1305 
1306   PetscFunctionBegin;
1307   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1308   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1309     *flg = PETSC_FALSE;
1310     PetscFunctionReturn(0);
1311   }
1312 
1313   /* if the a->i are the same */
1314   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
1315   if (!*flg) PetscFunctionReturn(0);
1316 
1317   /* if a->j are the same */
1318   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
1319   if (!*flg) PetscFunctionReturn(0);
1320 
1321   /* if a->a are the same */
1322   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1323   PetscFunctionReturn(0);
1324 }
1325 
1326 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1327 {
1328   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1329   PetscErrorCode  ierr;
1330   PetscInt        i,j,k,row,bs,ambs,bs2;
1331   const PetscInt  *ai,*aj;
1332   PetscScalar     *x,zero = 0.0;
1333   const MatScalar *aa,*aa_j;
1334 
1335   PetscFunctionBegin;
1336   bs = A->rmap->bs;
1337   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1338 
1339   aa   = a->a;
1340   ambs = a->mbs;
1341 
1342   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1343     PetscInt *diag=a->diag;
1344     aa   = a->a;
1345     ambs = a->mbs;
1346     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1347     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1348     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1349     PetscFunctionReturn(0);
1350   }
1351 
1352   ai   = a->i;
1353   aj   = a->j;
1354   bs2  = a->bs2;
1355   ierr = VecSet(v,zero);CHKERRQ(ierr);
1356   if (!a->nz) PetscFunctionReturn(0);
1357   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1358   for (i=0; i<ambs; i++) {
1359     j=ai[i];
1360     if (aj[j] == i) {    /* if this is a diagonal element */
1361       row  = i*bs;
1362       aa_j = aa + j*bs2;
1363       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1364     }
1365   }
1366   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1367   PetscFunctionReturn(0);
1368 }
1369 
1370 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1371 {
1372   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1373   PetscScalar       x;
1374   const PetscScalar *l,*li,*ri;
1375   MatScalar         *aa,*v;
1376   PetscErrorCode    ierr;
1377   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1378   const PetscInt    *ai,*aj;
1379   PetscBool         flg;
1380 
1381   PetscFunctionBegin;
1382   if (ll != rr) {
1383     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1384     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1385   }
1386   if (!ll) PetscFunctionReturn(0);
1387   ai  = a->i;
1388   aj  = a->j;
1389   aa  = a->a;
1390   m   = A->rmap->N;
1391   bs  = A->rmap->bs;
1392   mbs = a->mbs;
1393   bs2 = a->bs2;
1394 
1395   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1396   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1397   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1398   for (i=0; i<mbs; i++) { /* for each block row */
1399     M  = ai[i+1] - ai[i];
1400     li = l + i*bs;
1401     v  = aa + bs2*ai[i];
1402     for (j=0; j<M; j++) { /* for each block */
1403       ri = l + bs*aj[ai[i]+j];
1404       for (k=0; k<bs; k++) {
1405         x = ri[k];
1406         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1407       }
1408     }
1409   }
1410   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1411   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1416 {
1417   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1418 
1419   PetscFunctionBegin;
1420   info->block_size   = a->bs2;
1421   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1422   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1423   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1424   info->assemblies   = A->num_ass;
1425   info->mallocs      = A->info.mallocs;
1426   info->memory       = ((PetscObject)A)->mem;
1427   if (A->factortype) {
1428     info->fill_ratio_given  = A->info.fill_ratio_given;
1429     info->fill_ratio_needed = A->info.fill_ratio_needed;
1430     info->factor_mallocs    = A->info.factor_mallocs;
1431   } else {
1432     info->fill_ratio_given  = 0;
1433     info->fill_ratio_needed = 0;
1434     info->factor_mallocs    = 0;
1435   }
1436   PetscFunctionReturn(0);
1437 }
1438 
1439 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1440 {
1441   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1442   PetscErrorCode ierr;
1443 
1444   PetscFunctionBegin;
1445   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
1446   PetscFunctionReturn(0);
1447 }
1448 
1449 /*
1450    This code does not work since it only checks the upper triangular part of
1451   the matrix. Hence it is not listed in the function table.
1452 */
1453 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1454 {
1455   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1456   PetscErrorCode  ierr;
1457   PetscInt        i,j,n,row,col,bs,mbs;
1458   const PetscInt  *ai,*aj;
1459   PetscReal       atmp;
1460   const MatScalar *aa;
1461   PetscScalar     *x;
1462   PetscInt        ncols,brow,bcol,krow,kcol;
1463 
1464   PetscFunctionBegin;
1465   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1466   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1467   bs  = A->rmap->bs;
1468   aa  = a->a;
1469   ai  = a->i;
1470   aj  = a->j;
1471   mbs = a->mbs;
1472 
1473   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1474   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1475   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1476   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1477   for (i=0; i<mbs; i++) {
1478     ncols = ai[1] - ai[0]; ai++;
1479     brow  = bs*i;
1480     for (j=0; j<ncols; j++) {
1481       bcol = bs*(*aj);
1482       for (kcol=0; kcol<bs; kcol++) {
1483         col = bcol + kcol;      /* col index */
1484         for (krow=0; krow<bs; krow++) {
1485           atmp = PetscAbsScalar(*aa); aa++;
1486           row  = brow + krow;   /* row index */
1487           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1488           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1489         }
1490       }
1491       aj++;
1492     }
1493   }
1494   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1495   PetscFunctionReturn(0);
1496 }
1497 
1498 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1499 {
1500   PetscErrorCode ierr;
1501 
1502   PetscFunctionBegin;
1503   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1504   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1505   PetscFunctionReturn(0);
1506 }
1507 
1508 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1509 {
1510   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1511   PetscScalar       *z = c;
1512   const PetscScalar *xb;
1513   PetscScalar       x1;
1514   const MatScalar   *v = a->a,*vv;
1515   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1516 
1517   PetscFunctionBegin;
1518   for (i=0; i<mbs; i++) {
1519     n = ii[1] - ii[0]; ii++;
1520     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1521     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1522     jj = idx;
1523     vv = v;
1524     for (k=0; k<cn; k++) {
1525       idx = jj;
1526       v = vv;
1527       for (j=0; j<n; j++) {
1528         xb = b + (*idx); x1 = xb[0+k*bm];
1529         z[0+k*cm] += v[0]*x1;
1530         if (*idx != i) c[(*idx)+k*cm] += v[0]*b[i+k*bm];
1531         v += 1;
1532         ++idx;
1533       }
1534     }
1535     z += 1;
1536   }
1537   PetscFunctionReturn(0);
1538 }
1539 
1540 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1541 {
1542   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1543   PetscScalar       *z = c;
1544   const PetscScalar *xb;
1545   PetscScalar       x1,x2;
1546   const MatScalar   *v = a->a,*vv;
1547   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1548 
1549   PetscFunctionBegin;
1550   for (i=0; i<mbs; i++) {
1551     n = ii[1] - ii[0]; ii++;
1552     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1553     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1554     jj = idx;
1555     vv = v;
1556     for (k=0; k<cn; k++) {
1557       idx = jj;
1558       v = vv;
1559       for (j=0; j<n; j++) {
1560         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1561         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1562         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1563         if (*idx != i) {
1564           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1565           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1566         }
1567         v += 4;
1568         ++idx;
1569       }
1570     }
1571     z += 2;
1572   }
1573   PetscFunctionReturn(0);
1574 }
1575 
1576 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1577 {
1578   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1579   PetscScalar       *z = c;
1580   const PetscScalar *xb;
1581   PetscScalar       x1,x2,x3;
1582   const MatScalar   *v = a->a,*vv;
1583   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1584 
1585   PetscFunctionBegin;
1586   for (i=0; i<mbs; i++) {
1587     n = ii[1] - ii[0]; ii++;
1588     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1589     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1590     jj = idx;
1591     vv = v;
1592     for (k=0; k<cn; k++) {
1593       idx = jj;
1594       v = vv;
1595       for (j=0; j<n; j++) {
1596         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1597         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1598         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1599         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1600         if (*idx != i) {
1601           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1602           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1603           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1604         }
1605         v += 9;
1606         ++idx;
1607       }
1608     }
1609     z += 3;
1610   }
1611   PetscFunctionReturn(0);
1612 }
1613 
1614 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1615 {
1616   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1617   PetscScalar       *z = c;
1618   const PetscScalar *xb;
1619   PetscScalar       x1,x2,x3,x4;
1620   const MatScalar   *v = a->a,*vv;
1621   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1622 
1623   PetscFunctionBegin;
1624   for (i=0; i<mbs; i++) {
1625     n = ii[1] - ii[0]; ii++;
1626     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1627     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1628     jj = idx;
1629     vv = v;
1630     for (k=0; k<cn; k++) {
1631       idx = jj;
1632       v = vv;
1633       for (j=0; j<n; j++) {
1634         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1635         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1636         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1637         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1638         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1639         if (*idx != i) {
1640           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1641           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1642           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1643           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1644         }
1645         v += 16;
1646         ++idx;
1647       }
1648     }
1649     z += 4;
1650   }
1651   PetscFunctionReturn(0);
1652 }
1653 
1654 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1655 {
1656   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1657   PetscScalar       *z = c;
1658   const PetscScalar *xb;
1659   PetscScalar       x1,x2,x3,x4,x5;
1660   const MatScalar   *v = a->a,*vv;
1661   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1662 
1663   PetscFunctionBegin;
1664   for (i=0; i<mbs; i++) {
1665     n = ii[1] - ii[0]; ii++;
1666     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1667     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1668     jj = idx;
1669     vv = v;
1670     for (k=0; k<cn; k++) {
1671       idx = jj;
1672       v = vv;
1673       for (j=0; j<n; j++) {
1674         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1675         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1676         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1677         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1678         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1679         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1680         if (*idx != i) {
1681           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1682           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1683           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1684           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1685           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1686         }
1687         v += 25;
1688         ++idx;
1689       }
1690     }
1691     z += 5;
1692   }
1693   PetscFunctionReturn(0);
1694 }
1695 
1696 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1697 {
1698   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1699   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1700   Mat_SeqDense      *cd = (Mat_SeqDense*)B->data;
1701   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1702   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1703   PetscBLASInt      bbs,bcn,bbm,bcm;
1704   PetscScalar       *z = 0;
1705   PetscScalar       *c,*b;
1706   const MatScalar   *v;
1707   const PetscInt    *idx,*ii;
1708   PetscScalar       _DOne=1.0;
1709   PetscErrorCode    ierr;
1710 
1711   PetscFunctionBegin;
1712   if (!cm || !cn) PetscFunctionReturn(0);
1713   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1714   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1715   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1716   b = bd->v;
1717   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1718   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1719   switch (bs) {
1720   case 1:
1721     ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1722     break;
1723   case 2:
1724     ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1725     break;
1726   case 3:
1727     ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1728     break;
1729   case 4:
1730     ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1731     break;
1732   case 5:
1733     ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1734     break;
1735   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1736     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
1737     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
1738     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
1739     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
1740     idx = a->j;
1741     v   = a->a;
1742     mbs = a->mbs;
1743     ii  = a->i;
1744     z   = c;
1745     for (i=0; i<mbs; i++) {
1746       n = ii[1] - ii[0]; ii++;
1747       for (j=0; j<n; j++) {
1748         if (*idx != i)
1749           PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1750         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1751         v += bs2;
1752       }
1753       z += bs;
1754     }
1755   }
1756   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1757   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1758   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1759   ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr);
1760   PetscFunctionReturn(0);
1761 }
1762