xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 94ef8dde638caef1d0cd84a7dc8a2db65fcda8b6)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
9 {
10   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
13   const PetscInt *idx;
14   PetscBT        table_out,table_in;
15 
16   PetscFunctionBegin;
17   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18   mbs  = a->mbs;
19   ai   = a->i;
20   aj   = a->j;
21   bs   = A->rmap->bs;
22   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
23   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
24   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
25   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
26 
27   for (i=0; i<is_max; i++) { /* for each is */
28     isz  = 0;
29     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
30 
31     /* Extract the indices, assume there can be duplicate entries */
32     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
33     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
34 
35     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36     bcol_max = 0;
37     for (j=0; j<n; ++j) {
38       brow = idx[j]/bs; /* convert the indices into block indices */
39       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40       if (!PetscBTLookupSet(table_out,brow)) {
41         nidx[isz++] = brow;
42         if (bcol_max < brow) bcol_max = brow;
43       }
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap */
50       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
52       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
53 
54       n = isz;  /* length of the updated is[i] */
55       for (brow=0; brow<mbs; brow++) {
56         start = ai[brow]; end   = ai[brow+1];
57         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l<end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out,bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in,bcol)) {
72               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94         Zero some ops' to avoid invalid usse */
95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
96 {
97   PetscErrorCode ierr;
98 
99   PetscFunctionBegin;
100   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
101   Bseq->ops->mult                   = 0;
102   Bseq->ops->multadd                = 0;
103   Bseq->ops->multtranspose          = 0;
104   Bseq->ops->multtransposeadd       = 0;
105   Bseq->ops->lufactor               = 0;
106   Bseq->ops->choleskyfactor         = 0;
107   Bseq->ops->lufactorsymbolic       = 0;
108   Bseq->ops->choleskyfactorsymbolic = 0;
109   Bseq->ops->getinertia             = 0;
110   PetscFunctionReturn(0);
111 }
112 
113 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
114 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
115 {
116   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
117   PetscErrorCode ierr;
118   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
119   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
120   const PetscInt *irow,*icol;
121   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
122   PetscInt       *aj = a->j,*ai = a->i;
123   MatScalar      *mat_a;
124   Mat            C;
125   PetscBool      flag;
126 
127   PetscFunctionBegin;
128 
129   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
130   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
131   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
132   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
133 
134   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
135   ssmap = smap;
136   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
137   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
138   /* determine lens of each row */
139   for (i=0; i<nrows; i++) {
140     kstart  = ai[irow[i]];
141     kend    = kstart + a->ilen[irow[i]];
142     lens[i] = 0;
143     for (k=kstart; k<kend; k++) {
144       if (ssmap[aj[k]]) lens[i]++;
145     }
146   }
147   /* Create and fill new matrix */
148   if (scall == MAT_REUSE_MATRIX) {
149     c = (Mat_SeqSBAIJ*)((*B)->data);
150 
151     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
152     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
153     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
154     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
155     C    = *B;
156   } else {
157     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
158     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
159     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
160     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
161   }
162   c = (Mat_SeqSBAIJ*)(C->data);
163   for (i=0; i<nrows; i++) {
164     row      = irow[i];
165     kstart   = ai[row];
166     kend     = kstart + a->ilen[row];
167     mat_i    = c->i[i];
168     mat_j    = c->j + mat_i;
169     mat_a    = c->a + mat_i*bs2;
170     mat_ilen = c->ilen + i;
171     for (k=kstart; k<kend; k++) {
172       if ((tcol=ssmap[a->j[k]])) {
173         *mat_j++ = tcol - 1;
174         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
175         mat_a   += bs2;
176         (*mat_ilen)++;
177       }
178     }
179   }
180   /* sort */
181   {
182     MatScalar *work;
183 
184     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
185     for (i=0; i<nrows; i++) {
186       PetscInt ilen;
187       mat_i = c->i[i];
188       mat_j = c->j + mat_i;
189       mat_a = c->a + mat_i*bs2;
190       ilen  = c->ilen[i];
191       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
192     }
193     ierr = PetscFree(work);CHKERRQ(ierr);
194   }
195 
196   /* Free work space */
197   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
198   ierr = PetscFree(smap);CHKERRQ(ierr);
199   ierr = PetscFree(lens);CHKERRQ(ierr);
200   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
201   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202 
203   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
204   *B   = C;
205   PetscFunctionReturn(0);
206 }
207 
208 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
209 {
210   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
211   IS             is1,is2;
212   PetscErrorCode ierr;
213   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
214   const PetscInt *irow,*icol;
215 
216   PetscFunctionBegin;
217   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
218   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
219   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
220   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
221 
222   /* Verify if the indices corespond to each element in a block
223    and form the IS with compressed IS */
224   maxmnbs = PetscMax(a->mbs,a->nbs);
225   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
226   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
227   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
228   for (i=0; i<a->mbs; i++) {
229     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
230   }
231   count = 0;
232   for (i=0; i<nrows; i++) {
233     PetscInt j = irow[i] / bs;
234     if ((vary[j]--)==bs) iary[count++] = j;
235   }
236   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
237 
238   ierr = PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));CHKERRQ(ierr);
239   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
240   for (i=0; i<a->nbs; i++) {
241     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
242   }
243   count = 0;
244   for (i=0; i<ncols; i++) {
245     PetscInt j = icol[i] / bs;
246     if ((vary[j]--)==bs) iary[count++] = j;
247   }
248   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
249   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
250   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
251   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
252 
253   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
254   ierr = ISDestroy(&is1);CHKERRQ(ierr);
255   ierr = ISDestroy(&is2);CHKERRQ(ierr);
256 
257   if (isrow != iscol) {
258     PetscBool isequal;
259     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
260     if (!isequal) {
261       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
262     }
263   }
264   PetscFunctionReturn(0);
265 }
266 
267 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
268 {
269   PetscErrorCode ierr;
270   PetscInt       i;
271 
272   PetscFunctionBegin;
273   if (scall == MAT_INITIAL_MATRIX) {
274     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
275   }
276 
277   for (i=0; i<n; i++) {
278     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
279   }
280   PetscFunctionReturn(0);
281 }
282 
283 /* -------------------------------------------------------*/
284 /* Should check that shapes of vectors and matrices match */
285 /* -------------------------------------------------------*/
286 
287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
288 {
289   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
290   PetscScalar       *z,x1,x2,zero=0.0;
291   const PetscScalar *x,*xb;
292   const MatScalar   *v;
293   PetscErrorCode    ierr;
294   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
295   const PetscInt    *aj=a->j,*ai=a->i,*ib;
296   PetscInt          nonzerorow=0;
297 
298   PetscFunctionBegin;
299   ierr = VecSet(zz,zero);CHKERRQ(ierr);
300   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
301   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
302 
303   v  = a->a;
304   xb = x;
305 
306   for (i=0; i<mbs; i++) {
307     n           = ai[1] - ai[0]; /* length of i_th block row of A */
308     x1          = xb[0]; x2 = xb[1];
309     ib          = aj + *ai;
310     jmin        = 0;
311     nonzerorow += (n>0);
312     if (*ib == i) {     /* (diag of A)*x */
313       z[2*i]   += v[0]*x1 + v[2]*x2;
314       z[2*i+1] += v[2]*x1 + v[3]*x2;
315       v        += 4; jmin++;
316     }
317     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
318     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
319     for (j=jmin; j<n; j++) {
320       /* (strict lower triangular part of A)*x  */
321       cval       = ib[j]*2;
322       z[cval]   += v[0]*x1 + v[1]*x2;
323       z[cval+1] += v[2]*x1 + v[3]*x2;
324       /* (strict upper triangular part of A)*x  */
325       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
326       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
327       v        += 4;
328     }
329     xb +=2; ai++;
330   }
331 
332   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
333   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
334   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
335   PetscFunctionReturn(0);
336 }
337 
338 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
339 {
340   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
341   PetscScalar       *z,x1,x2,x3,zero=0.0;
342   const PetscScalar *x,*xb;
343   const MatScalar   *v;
344   PetscErrorCode    ierr;
345   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
346   const PetscInt    *aj = a->j,*ai = a->i,*ib;
347   PetscInt          nonzerorow=0;
348 
349   PetscFunctionBegin;
350   ierr = VecSet(zz,zero);CHKERRQ(ierr);
351   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
352   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
353 
354   v  = a->a;
355   xb = x;
356 
357   for (i=0; i<mbs; i++) {
358     n           = ai[1] - ai[0]; /* length of i_th block row of A */
359     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
360     ib          = aj + *ai;
361     jmin        = 0;
362     nonzerorow += (n>0);
363     if (*ib == i) {     /* (diag of A)*x */
364       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
365       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
366       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
367       v        += 9; jmin++;
368     }
369     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
370     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
371     for (j=jmin; j<n; j++) {
372       /* (strict lower triangular part of A)*x  */
373       cval       = ib[j]*3;
374       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
375       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
376       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
377       /* (strict upper triangular part of A)*x  */
378       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
379       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
380       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
381       v        += 9;
382     }
383     xb +=3; ai++;
384   }
385 
386   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
387   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
388   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
389   PetscFunctionReturn(0);
390 }
391 
392 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
393 {
394   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
395   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
396   const PetscScalar *x,*xb;
397   const MatScalar   *v;
398   PetscErrorCode    ierr;
399   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
400   const PetscInt    *aj = a->j,*ai = a->i,*ib;
401   PetscInt          nonzerorow = 0;
402 
403   PetscFunctionBegin;
404   ierr = VecSet(zz,zero);CHKERRQ(ierr);
405   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
406   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
407 
408   v  = a->a;
409   xb = x;
410 
411   for (i=0; i<mbs; i++) {
412     n           = ai[1] - ai[0]; /* length of i_th block row of A */
413     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
414     ib          = aj + *ai;
415     jmin        = 0;
416     nonzerorow += (n>0);
417     if (*ib == i) {     /* (diag of A)*x */
418       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
419       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
420       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
421       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
422       v        += 16; jmin++;
423     }
424     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
425     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
426     for (j=jmin; j<n; j++) {
427       /* (strict lower triangular part of A)*x  */
428       cval       = ib[j]*4;
429       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
430       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
431       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
432       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
433       /* (strict upper triangular part of A)*x  */
434       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
435       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
436       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
437       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
438       v        += 16;
439     }
440     xb +=4; ai++;
441   }
442 
443   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
444   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
445   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
446   PetscFunctionReturn(0);
447 }
448 
449 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
450 {
451   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
452   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
453   const PetscScalar *x,*xb;
454   const MatScalar   *v;
455   PetscErrorCode    ierr;
456   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
457   const PetscInt    *aj = a->j,*ai = a->i,*ib;
458   PetscInt          nonzerorow=0;
459 
460   PetscFunctionBegin;
461   ierr = VecSet(zz,zero);CHKERRQ(ierr);
462   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
463   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
464 
465   v  = a->a;
466   xb = x;
467 
468   for (i=0; i<mbs; i++) {
469     n           = ai[1] - ai[0]; /* length of i_th block row of A */
470     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
471     ib          = aj + *ai;
472     jmin        = 0;
473     nonzerorow += (n>0);
474     if (*ib == i) {      /* (diag of A)*x */
475       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
476       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
477       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
478       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
479       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
480       v        += 25; jmin++;
481     }
482     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
483     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
484     for (j=jmin; j<n; j++) {
485       /* (strict lower triangular part of A)*x  */
486       cval       = ib[j]*5;
487       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
488       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
489       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
490       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
491       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
492       /* (strict upper triangular part of A)*x  */
493       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
494       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
495       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
496       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
497       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
498       v        += 25;
499     }
500     xb +=5; ai++;
501   }
502 
503   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
504   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
505   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
506   PetscFunctionReturn(0);
507 }
508 
509 
510 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
511 {
512   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
513   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
514   const PetscScalar *x,*xb;
515   const MatScalar   *v;
516   PetscErrorCode    ierr;
517   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
518   const PetscInt    *aj=a->j,*ai=a->i,*ib;
519   PetscInt          nonzerorow=0;
520 
521   PetscFunctionBegin;
522   ierr = VecSet(zz,zero);CHKERRQ(ierr);
523   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
524   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
525 
526   v  = a->a;
527   xb = x;
528 
529   for (i=0; i<mbs; i++) {
530     n           = ai[1] - ai[0]; /* length of i_th block row of A */
531     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
532     ib          = aj + *ai;
533     jmin        = 0;
534     nonzerorow += (n>0);
535     if (*ib == i) {      /* (diag of A)*x */
536       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
537       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
538       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
539       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
540       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
541       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
542       v        += 36; jmin++;
543     }
544     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
545     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
546     for (j=jmin; j<n; j++) {
547       /* (strict lower triangular part of A)*x  */
548       cval       = ib[j]*6;
549       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
550       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
551       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
552       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
553       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
554       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
555       /* (strict upper triangular part of A)*x  */
556       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
557       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
558       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
559       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
560       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
561       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
562       v        += 36;
563     }
564     xb +=6; ai++;
565   }
566 
567   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
568   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
569   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
570   PetscFunctionReturn(0);
571 }
572 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
573 {
574   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
575   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
576   const PetscScalar *x,*xb;
577   const MatScalar   *v;
578   PetscErrorCode    ierr;
579   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
580   const PetscInt    *aj=a->j,*ai=a->i,*ib;
581   PetscInt          nonzerorow=0;
582 
583   PetscFunctionBegin;
584   ierr = VecSet(zz,zero);CHKERRQ(ierr);
585   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
586   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
587 
588   v  = a->a;
589   xb = x;
590 
591   for (i=0; i<mbs; i++) {
592     n           = ai[1] - ai[0]; /* length of i_th block row of A */
593     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
594     ib          = aj + *ai;
595     jmin        = 0;
596     nonzerorow += (n>0);
597     if (*ib == i) {      /* (diag of A)*x */
598       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
599       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
600       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
601       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
602       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
603       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
604       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
605       v        += 49; jmin++;
606     }
607     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
608     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
609     for (j=jmin; j<n; j++) {
610       /* (strict lower triangular part of A)*x  */
611       cval       = ib[j]*7;
612       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
613       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
614       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
615       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
616       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
617       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
618       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
619       /* (strict upper triangular part of A)*x  */
620       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
621       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
622       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
623       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
624       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
625       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
626       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
627       v       += 49;
628     }
629     xb +=7; ai++;
630   }
631   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
632   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
633   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
634   PetscFunctionReturn(0);
635 }
636 
637 /*
638     This will not work with MatScalar == float because it calls the BLAS
639 */
640 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
641 {
642   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
643   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
644   const PetscScalar *x,*x_ptr,*xb;
645   const MatScalar   *v;
646   PetscErrorCode    ierr;
647   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
648   const PetscInt    *idx,*aj,*ii;
649   PetscInt          nonzerorow=0;
650 
651   PetscFunctionBegin;
652   ierr = VecSet(zz,zero);CHKERRQ(ierr);
653   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
654   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
655 
656   aj = a->j;
657   v  = a->a;
658   ii = a->i;
659 
660   if (!a->mult_work) {
661     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
662   }
663   work = a->mult_work;
664 
665   for (i=0; i<mbs; i++) {
666     n           = ii[1] - ii[0]; ncols = n*bs;
667     workt       = work; idx=aj+ii[0];
668     nonzerorow += (n>0);
669 
670     /* upper triangular part */
671     for (j=0; j<n; j++) {
672       xb = x_ptr + bs*(*idx++);
673       for (k=0; k<bs; k++) workt[k] = xb[k];
674       workt += bs;
675     }
676     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
677     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
678 
679     /* strict lower triangular part */
680     idx = aj+ii[0];
681     if (*idx == i) {
682       ncols -= bs; v += bs2; idx++; n--;
683     }
684 
685     if (ncols > 0) {
686       workt = work;
687       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
688       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
689       for (j=0; j<n; j++) {
690         zb = z_ptr + bs*(*idx++);
691         for (k=0; k<bs; k++) zb[k] += workt[k];
692         workt += bs;
693       }
694     }
695     x += bs; v += n*bs2; z += bs; ii++;
696   }
697 
698   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
699   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
700   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
701   PetscFunctionReturn(0);
702 }
703 
704 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
705 {
706   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
707   PetscScalar       *z,x1;
708   const PetscScalar *x,*xb;
709   const MatScalar   *v;
710   PetscErrorCode    ierr;
711   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
712   const PetscInt    *aj=a->j,*ai=a->i,*ib;
713   PetscInt          nonzerorow=0;
714 
715   PetscFunctionBegin;
716   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
717   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
718   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
719   v    = a->a;
720   xb   = x;
721 
722   for (i=0; i<mbs; i++) {
723     n           = ai[1] - ai[0]; /* length of i_th row of A */
724     x1          = xb[0];
725     ib          = aj + *ai;
726     jmin        = 0;
727     nonzerorow += (n>0);
728     if (*ib == i) {            /* (diag of A)*x */
729       z[i] += *v++ * x[*ib++]; jmin++;
730     }
731     for (j=jmin; j<n; j++) {
732       cval    = *ib;
733       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
734       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
735     }
736     xb++; ai++;
737   }
738 
739   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
740   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
741 
742   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
743   PetscFunctionReturn(0);
744 }
745 
746 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
747 {
748   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
749   PetscScalar       *z,x1,x2;
750   const PetscScalar *x,*xb;
751   const MatScalar   *v;
752   PetscErrorCode    ierr;
753   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
754   const PetscInt    *aj=a->j,*ai=a->i,*ib;
755   PetscInt          nonzerorow=0;
756 
757   PetscFunctionBegin;
758   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
759   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
760   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
761 
762   v  = a->a;
763   xb = x;
764 
765   for (i=0; i<mbs; i++) {
766     n           = ai[1] - ai[0]; /* length of i_th block row of A */
767     x1          = xb[0]; x2 = xb[1];
768     ib          = aj + *ai;
769     jmin        = 0;
770     nonzerorow += (n>0);
771     if (*ib == i) {      /* (diag of A)*x */
772       z[2*i]   += v[0]*x1 + v[2]*x2;
773       z[2*i+1] += v[2]*x1 + v[3]*x2;
774       v        += 4; jmin++;
775     }
776     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
777     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
778     for (j=jmin; j<n; j++) {
779       /* (strict lower triangular part of A)*x  */
780       cval       = ib[j]*2;
781       z[cval]   += v[0]*x1 + v[1]*x2;
782       z[cval+1] += v[2]*x1 + v[3]*x2;
783       /* (strict upper triangular part of A)*x  */
784       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
785       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
786       v        += 4;
787     }
788     xb +=2; ai++;
789   }
790   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
791   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
792 
793   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
794   PetscFunctionReturn(0);
795 }
796 
797 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
798 {
799   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
800   PetscScalar       *z,x1,x2,x3;
801   const PetscScalar *x,*xb;
802   const MatScalar   *v;
803   PetscErrorCode    ierr;
804   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
805   const PetscInt    *aj=a->j,*ai=a->i,*ib;
806   PetscInt          nonzerorow=0;
807 
808   PetscFunctionBegin;
809   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
810   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
811   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
812 
813   v  = a->a;
814   xb = x;
815 
816   for (i=0; i<mbs; i++) {
817     n           = ai[1] - ai[0]; /* length of i_th block row of A */
818     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
819     ib          = aj + *ai;
820     jmin        = 0;
821     nonzerorow += (n>0);
822     if (*ib == i) {     /* (diag of A)*x */
823       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
824       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
825       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
826       v        += 9; jmin++;
827     }
828     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
829     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
830     for (j=jmin; j<n; j++) {
831       /* (strict lower triangular part of A)*x  */
832       cval       = ib[j]*3;
833       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
834       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
835       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
836       /* (strict upper triangular part of A)*x  */
837       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
838       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
839       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
840       v        += 9;
841     }
842     xb +=3; ai++;
843   }
844 
845   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
846   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
847 
848   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
849   PetscFunctionReturn(0);
850 }
851 
852 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
853 {
854   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
855   PetscScalar       *z,x1,x2,x3,x4;
856   const PetscScalar *x,*xb;
857   const MatScalar   *v;
858   PetscErrorCode    ierr;
859   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
860   const PetscInt    *aj=a->j,*ai=a->i,*ib;
861   PetscInt          nonzerorow=0;
862 
863   PetscFunctionBegin;
864   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
865   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
866   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
867 
868   v  = a->a;
869   xb = x;
870 
871   for (i=0; i<mbs; i++) {
872     n           = ai[1] - ai[0]; /* length of i_th block row of A */
873     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
874     ib          = aj + *ai;
875     jmin        = 0;
876     nonzerorow += (n>0);
877     if (*ib == i) {      /* (diag of A)*x */
878       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
879       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
880       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
881       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
882       v        += 16; jmin++;
883     }
884     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
885     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
886     for (j=jmin; j<n; j++) {
887       /* (strict lower triangular part of A)*x  */
888       cval       = ib[j]*4;
889       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
890       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
891       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
892       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
893       /* (strict upper triangular part of A)*x  */
894       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
895       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
896       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
897       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
898       v        += 16;
899     }
900     xb +=4; ai++;
901   }
902 
903   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
904   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
905 
906   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
907   PetscFunctionReturn(0);
908 }
909 
910 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
911 {
912   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
913   PetscScalar       *z,x1,x2,x3,x4,x5;
914   const PetscScalar *x,*xb;
915   const MatScalar   *v;
916   PetscErrorCode    ierr;
917   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
918   const PetscInt    *aj=a->j,*ai=a->i,*ib;
919   PetscInt          nonzerorow=0;
920 
921   PetscFunctionBegin;
922   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
923   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
924   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
925 
926   v  = a->a;
927   xb = x;
928 
929   for (i=0; i<mbs; i++) {
930     n           = ai[1] - ai[0]; /* length of i_th block row of A */
931     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
932     ib          = aj + *ai;
933     jmin        = 0;
934     nonzerorow += (n>0);
935     if (*ib == i) {      /* (diag of A)*x */
936       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
937       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
938       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
939       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
940       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
941       v        += 25; jmin++;
942     }
943     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
944     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
945     for (j=jmin; j<n; j++) {
946       /* (strict lower triangular part of A)*x  */
947       cval       = ib[j]*5;
948       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
949       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
950       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
951       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
952       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
953       /* (strict upper triangular part of A)*x  */
954       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
955       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
956       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
957       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
958       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
959       v        += 25;
960     }
961     xb +=5; ai++;
962   }
963 
964   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
965   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
966 
967   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
968   PetscFunctionReturn(0);
969 }
970 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
971 {
972   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
973   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
974   const PetscScalar *x,*xb;
975   const MatScalar   *v;
976   PetscErrorCode    ierr;
977   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
978   const PetscInt    *aj=a->j,*ai=a->i,*ib;
979   PetscInt          nonzerorow=0;
980 
981   PetscFunctionBegin;
982   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
983   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
984   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
985 
986   v  = a->a;
987   xb = x;
988 
989   for (i=0; i<mbs; i++) {
990     n           = ai[1] - ai[0]; /* length of i_th block row of A */
991     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
992     ib          = aj + *ai;
993     jmin        = 0;
994     nonzerorow += (n>0);
995     if (*ib == i) {     /* (diag of A)*x */
996       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
997       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
998       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
999       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1000       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1001       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1002       v        += 36; jmin++;
1003     }
1004     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1005     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1006     for (j=jmin; j<n; j++) {
1007       /* (strict lower triangular part of A)*x  */
1008       cval       = ib[j]*6;
1009       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1010       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1011       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1012       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1013       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1014       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015       /* (strict upper triangular part of A)*x  */
1016       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1017       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1018       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1019       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1020       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1021       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1022       v        += 36;
1023     }
1024     xb +=6; ai++;
1025   }
1026 
1027   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1028   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1029 
1030   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1035 {
1036   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1037   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1038   const PetscScalar *x,*xb;
1039   const MatScalar   *v;
1040   PetscErrorCode    ierr;
1041   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1042   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1043   PetscInt          nonzerorow=0;
1044 
1045   PetscFunctionBegin;
1046   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1047   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1048   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1049 
1050   v  = a->a;
1051   xb = x;
1052 
1053   for (i=0; i<mbs; i++) {
1054     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1055     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1056     ib          = aj + *ai;
1057     jmin        = 0;
1058     nonzerorow += (n>0);
1059     if (*ib == i) {     /* (diag of A)*x */
1060       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1061       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1062       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1063       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1064       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1065       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1066       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1067       v        += 49; jmin++;
1068     }
1069     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1070     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1071     for (j=jmin; j<n; j++) {
1072       /* (strict lower triangular part of A)*x  */
1073       cval       = ib[j]*7;
1074       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1075       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1076       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1077       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1078       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1079       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1080       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1081       /* (strict upper triangular part of A)*x  */
1082       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1083       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1084       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1085       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1086       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1087       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1088       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1089       v       += 49;
1090     }
1091     xb +=7; ai++;
1092   }
1093 
1094   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1095   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1096 
1097   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1098   PetscFunctionReturn(0);
1099 }
1100 
1101 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1102 {
1103   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1104   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1105   const PetscScalar *x,*x_ptr,*xb;
1106   const MatScalar   *v;
1107   PetscErrorCode    ierr;
1108   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1109   const PetscInt    *idx,*aj,*ii;
1110   PetscInt          nonzerorow=0;
1111 
1112   PetscFunctionBegin;
1113   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1114   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1115   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1116 
1117   aj = a->j;
1118   v  = a->a;
1119   ii = a->i;
1120 
1121   if (!a->mult_work) {
1122     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1123   }
1124   work = a->mult_work;
1125 
1126 
1127   for (i=0; i<mbs; i++) {
1128     n           = ii[1] - ii[0]; ncols = n*bs;
1129     workt       = work; idx=aj+ii[0];
1130     nonzerorow += (n>0);
1131 
1132     /* upper triangular part */
1133     for (j=0; j<n; j++) {
1134       xb = x_ptr + bs*(*idx++);
1135       for (k=0; k<bs; k++) workt[k] = xb[k];
1136       workt += bs;
1137     }
1138     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1139     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1140 
1141     /* strict lower triangular part */
1142     idx = aj+ii[0];
1143     if (*idx == i) {
1144       ncols -= bs; v += bs2; idx++; n--;
1145     }
1146     if (ncols > 0) {
1147       workt = work;
1148       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1149       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1150       for (j=0; j<n; j++) {
1151         zb = z_ptr + bs*(*idx++);
1152         for (k=0; k<bs; k++) zb[k] += workt[k];
1153         workt += bs;
1154       }
1155     }
1156 
1157     x += bs; v += n*bs2; z += bs; ii++;
1158   }
1159 
1160   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1162 
1163   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1168 {
1169   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1170   PetscScalar    oalpha = alpha;
1171   PetscErrorCode ierr;
1172   PetscBLASInt   one = 1,totalnz;
1173 
1174   PetscFunctionBegin;
1175   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1176   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1177   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1182 {
1183   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1184   const MatScalar *v       = a->a;
1185   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1186   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1187   PetscErrorCode  ierr;
1188   const PetscInt  *aj=a->j,*col;
1189 
1190   PetscFunctionBegin;
1191   if (type == NORM_FROBENIUS) {
1192     for (k=0; k<mbs; k++) {
1193       jmin = a->i[k]; jmax = a->i[k+1];
1194       col  = aj + jmin;
1195       if (*col == k) {         /* diagonal block */
1196         for (i=0; i<bs2; i++) {
1197           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1198         }
1199         jmin++;
1200       }
1201       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1202         for (i=0; i<bs2; i++) {
1203           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1204         }
1205       }
1206     }
1207     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1208     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
1209   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1210     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1211     for (i=0; i<mbs; i++) jl[i] = mbs;
1212     il[0] = 0;
1213 
1214     *norm = 0.0;
1215     for (k=0; k<mbs; k++) { /* k_th block row */
1216       for (j=0; j<bs; j++) sum[j]=0.0;
1217       /*-- col sum --*/
1218       i = jl[k]; /* first |A(i,k)| to be added */
1219       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1220                   at step k */
1221       while (i<mbs) {
1222         nexti = jl[i];  /* next block row to be added */
1223         ik    = il[i];  /* block index of A(i,k) in the array a */
1224         for (j=0; j<bs; j++) {
1225           v = a->a + ik*bs2 + j*bs;
1226           for (k1=0; k1<bs; k1++) {
1227             sum[j] += PetscAbsScalar(*v); v++;
1228           }
1229         }
1230         /* update il, jl */
1231         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1232         jmax = a->i[i+1];
1233         if (jmin < jmax) {
1234           il[i] = jmin;
1235           j     = a->j[jmin];
1236           jl[i] = jl[j]; jl[j]=i;
1237         }
1238         i = nexti;
1239       }
1240       /*-- row sum --*/
1241       jmin = a->i[k]; jmax = a->i[k+1];
1242       for (i=jmin; i<jmax; i++) {
1243         for (j=0; j<bs; j++) {
1244           v = a->a + i*bs2 + j;
1245           for (k1=0; k1<bs; k1++) {
1246             sum[j] += PetscAbsScalar(*v); v += bs;
1247           }
1248         }
1249       }
1250       /* add k_th block row to il, jl */
1251       col = aj+jmin;
1252       if (*col == k) jmin++;
1253       if (jmin < jmax) {
1254         il[k] = jmin;
1255         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1256       }
1257       for (j=0; j<bs; j++) {
1258         if (sum[j] > *norm) *norm = sum[j];
1259       }
1260     }
1261     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1262     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1263   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1268 {
1269   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1270   PetscErrorCode ierr;
1271 
1272   PetscFunctionBegin;
1273   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1274   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1275     *flg = PETSC_FALSE;
1276     PetscFunctionReturn(0);
1277   }
1278 
1279   /* if the a->i are the same */
1280   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1281   if (!*flg) PetscFunctionReturn(0);
1282 
1283   /* if a->j are the same */
1284   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1285   if (!*flg) PetscFunctionReturn(0);
1286 
1287   /* if a->a are the same */
1288   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1289   PetscFunctionReturn(0);
1290 }
1291 
1292 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1293 {
1294   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1295   PetscErrorCode  ierr;
1296   PetscInt        i,j,k,row,bs,ambs,bs2;
1297   const PetscInt  *ai,*aj;
1298   PetscScalar     *x,zero = 0.0;
1299   const MatScalar *aa,*aa_j;
1300 
1301   PetscFunctionBegin;
1302   bs = A->rmap->bs;
1303   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1304 
1305   aa   = a->a;
1306   ambs = a->mbs;
1307 
1308   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1309     PetscInt *diag=a->diag;
1310     aa   = a->a;
1311     ambs = a->mbs;
1312     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1313     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1314     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1315     PetscFunctionReturn(0);
1316   }
1317 
1318   ai   = a->i;
1319   aj   = a->j;
1320   bs2  = a->bs2;
1321   ierr = VecSet(v,zero);CHKERRQ(ierr);
1322   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1323   for (i=0; i<ambs; i++) {
1324     j=ai[i];
1325     if (aj[j] == i) {    /* if this is a diagonal element */
1326       row  = i*bs;
1327       aa_j = aa + j*bs2;
1328       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1329     }
1330   }
1331   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1332   PetscFunctionReturn(0);
1333 }
1334 
1335 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1336 {
1337   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1338   PetscScalar       x;
1339   const PetscScalar *l,*li,*ri;
1340   MatScalar         *aa,*v;
1341   PetscErrorCode    ierr;
1342   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1343   PetscBool         flg;
1344 
1345   PetscFunctionBegin;
1346   if (ll != rr) {
1347     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1348     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1349   }
1350   if (!ll) PetscFunctionReturn(0);
1351   ai  = a->i;
1352   aj  = a->j;
1353   aa  = a->a;
1354   m   = A->rmap->N;
1355   bs  = A->rmap->bs;
1356   mbs = a->mbs;
1357   bs2 = a->bs2;
1358 
1359   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1360   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1361   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1362   for (i=0; i<mbs; i++) { /* for each block row */
1363     M  = ai[i+1] - ai[i];
1364     li = l + i*bs;
1365     v  = aa + bs2*ai[i];
1366     for (j=0; j<M; j++) { /* for each block */
1367       ri = l + bs*aj[ai[i]+j];
1368       for (k=0; k<bs; k++) {
1369         x = ri[k];
1370         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1371       }
1372     }
1373   }
1374   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1375   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1376   PetscFunctionReturn(0);
1377 }
1378 
1379 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1380 {
1381   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1382 
1383   PetscFunctionBegin;
1384   info->block_size   = a->bs2;
1385   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1386   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1387   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1388   info->assemblies   = A->num_ass;
1389   info->mallocs      = A->info.mallocs;
1390   info->memory       = ((PetscObject)A)->mem;
1391   if (A->factortype) {
1392     info->fill_ratio_given  = A->info.fill_ratio_given;
1393     info->fill_ratio_needed = A->info.fill_ratio_needed;
1394     info->factor_mallocs    = A->info.factor_mallocs;
1395   } else {
1396     info->fill_ratio_given  = 0;
1397     info->fill_ratio_needed = 0;
1398     info->factor_mallocs    = 0;
1399   }
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 
1404 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1405 {
1406   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1407   PetscErrorCode ierr;
1408 
1409   PetscFunctionBegin;
1410   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1411   PetscFunctionReturn(0);
1412 }
1413 
1414 /*
1415    This code does not work since it only checks the upper triangular part of
1416   the matrix. Hence it is not listed in the function table.
1417 */
1418 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1419 {
1420   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1421   PetscErrorCode  ierr;
1422   PetscInt        i,j,n,row,col,bs,mbs;
1423   const PetscInt  *ai,*aj;
1424   PetscReal       atmp;
1425   const MatScalar *aa;
1426   PetscScalar     *x;
1427   PetscInt        ncols,brow,bcol,krow,kcol;
1428 
1429   PetscFunctionBegin;
1430   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1431   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1432   bs  = A->rmap->bs;
1433   aa  = a->a;
1434   ai  = a->i;
1435   aj  = a->j;
1436   mbs = a->mbs;
1437 
1438   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1439   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1440   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1441   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1442   for (i=0; i<mbs; i++) {
1443     ncols = ai[1] - ai[0]; ai++;
1444     brow  = bs*i;
1445     for (j=0; j<ncols; j++) {
1446       bcol = bs*(*aj);
1447       for (kcol=0; kcol<bs; kcol++) {
1448         col = bcol + kcol;      /* col index */
1449         for (krow=0; krow<bs; krow++) {
1450           atmp = PetscAbsScalar(*aa); aa++;
1451           row  = brow + krow;   /* row index */
1452           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1453           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1454         }
1455       }
1456       aj++;
1457     }
1458   }
1459   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1460   PetscFunctionReturn(0);
1461 }
1462