xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision d2fd7bfc6f0fd2e1d083decbb7cc7d77e16824f0)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
9 {
10   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
13   const PetscInt *idx;
14   PetscBT        table_out,table_in;
15 
16   PetscFunctionBegin;
17   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
18   mbs  = a->mbs;
19   ai   = a->i;
20   aj   = a->j;
21   bs   = A->rmap->bs;
22   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
23   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
24   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
25   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
26 
27   for (i=0; i<is_max; i++) { /* for each is */
28     isz  = 0;
29     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
30 
31     /* Extract the indices, assume there can be duplicate entries */
32     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
33     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
34 
35     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
36     bcol_max = 0;
37     for (j=0; j<n; ++j) {
38       brow = idx[j]/bs; /* convert the indices into block indices */
39       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
40       if (!PetscBTLookupSet(table_out,brow)) {
41         nidx[isz++] = brow;
42         if (bcol_max < brow) bcol_max = brow;
43       }
44     }
45     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
46     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
47 
48     k = 0;
49     for (j=0; j<ov; j++) { /* for each overlap */
50       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
51       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
52       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
53 
54       n = isz;  /* length of the updated is[i] */
55       for (brow=0; brow<mbs; brow++) {
56         start = ai[brow]; end   = ai[brow+1];
57         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58           for (l = start; l<end; l++) {
59             bcol = aj[l];
60             if (!PetscBTLookupSet(table_out,bcol)) {
61               nidx[isz++] = bcol;
62               if (bcol_max < bcol) bcol_max = bcol;
63             }
64           }
65           k++;
66           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68           for (l = start; l<end; l++) {
69             bcol = aj[l];
70             if (bcol > bcol_max) break;
71             if (PetscBTLookup(table_in,bcol)) {
72               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
73               break; /* for l = start; l<end ; l++) */
74             }
75           }
76         }
77       }
78     } /* for each overlap */
79 
80     /* expand the Index Set */
81     for (j=0; j<isz; j++) {
82       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
83     }
84     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
85   }
86   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
87   ierr = PetscFree(nidx);CHKERRQ(ierr);
88   ierr = PetscFree(nidx2);CHKERRQ(ierr);
89   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94         Zero some ops' to avoid invalid usse */
95 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
96 {
97   PetscErrorCode ierr;
98 
99   PetscFunctionBegin;
100   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
101   Bseq->ops->mult                   = 0;
102   Bseq->ops->multadd                = 0;
103   Bseq->ops->multtranspose          = 0;
104   Bseq->ops->multtransposeadd       = 0;
105   Bseq->ops->lufactor               = 0;
106   Bseq->ops->choleskyfactor         = 0;
107   Bseq->ops->lufactorsymbolic       = 0;
108   Bseq->ops->choleskyfactorsymbolic = 0;
109   Bseq->ops->getinertia             = 0;
110   PetscFunctionReturn(0);
111 }
112 
113 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
114 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
115 {
116   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
117   PetscErrorCode ierr;
118   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
119   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
120   const PetscInt *irow,*icol;
121   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
122   PetscInt       *aj = a->j,*ai = a->i;
123   MatScalar      *mat_a;
124   Mat            C;
125   PetscBool      flag;
126 
127   PetscFunctionBegin;
128 
129   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
130   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
131   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
132   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
133 
134   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
135   ssmap = smap;
136   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
137   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
138   /* determine lens of each row */
139   for (i=0; i<nrows; i++) {
140     kstart  = ai[irow[i]];
141     kend    = kstart + a->ilen[irow[i]];
142     lens[i] = 0;
143     for (k=kstart; k<kend; k++) {
144       if (ssmap[aj[k]]) lens[i]++;
145     }
146   }
147   /* Create and fill new matrix */
148   if (scall == MAT_REUSE_MATRIX) {
149     c = (Mat_SeqSBAIJ*)((*B)->data);
150 
151     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
152     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
153     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
154     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
155     C    = *B;
156   } else {
157     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
158     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
159     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
160     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
161   }
162   c = (Mat_SeqSBAIJ*)(C->data);
163   for (i=0; i<nrows; i++) {
164     row      = irow[i];
165     kstart   = ai[row];
166     kend     = kstart + a->ilen[row];
167     mat_i    = c->i[i];
168     mat_j    = c->j + mat_i;
169     mat_a    = c->a + mat_i*bs2;
170     mat_ilen = c->ilen + i;
171     for (k=kstart; k<kend; k++) {
172       if ((tcol=ssmap[a->j[k]])) {
173         *mat_j++ = tcol - 1;
174         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
175         mat_a   += bs2;
176         (*mat_ilen)++;
177       }
178     }
179   }
180   /* sort */
181   {
182     MatScalar *work;
183 
184     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
185     for (i=0; i<nrows; i++) {
186       PetscInt ilen;
187       mat_i = c->i[i];
188       mat_j = c->j + mat_i;
189       mat_a = c->a + mat_i*bs2;
190       ilen  = c->ilen[i];
191       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
192     }
193     ierr = PetscFree(work);CHKERRQ(ierr);
194   }
195 
196   /* Free work space */
197   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
198   ierr = PetscFree(smap);CHKERRQ(ierr);
199   ierr = PetscFree(lens);CHKERRQ(ierr);
200   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
201   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202 
203   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
204   *B   = C;
205   PetscFunctionReturn(0);
206 }
207 
208 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
209 {
210   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
211   IS             is1,is2;
212   PetscErrorCode ierr;
213   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
214   const PetscInt *irow,*icol;
215 
216   PetscFunctionBegin;
217   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
218   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
219   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
220   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
221 
222   /* Verify if the indices corespond to each element in a block
223    and form the IS with compressed IS */
224   maxmnbs = PetscMax(a->mbs,a->nbs);
225   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
226   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
227   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
228   for (i=0; i<a->mbs; i++) {
229     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
230   }
231   count = 0;
232   for (i=0; i<nrows; i++) {
233     PetscInt j = irow[i] / bs;
234     if ((vary[j]--)==bs) iary[count++] = j;
235   }
236   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
237 
238   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
239   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
240   for (i=0; i<a->nbs; i++) {
241     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
242   }
243   count = 0;
244   for (i=0; i<ncols; i++) {
245     PetscInt j = icol[i] / bs;
246     if ((vary[j]--)==bs) iary[count++] = j;
247   }
248   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
249   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
250   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
251   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
252 
253   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
254   ierr = ISDestroy(&is1);CHKERRQ(ierr);
255   ierr = ISDestroy(&is2);CHKERRQ(ierr);
256 
257   if (isrow != iscol) {
258     PetscBool isequal;
259     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
260     if (!isequal) {
261       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
262     }
263   }
264   PetscFunctionReturn(0);
265 }
266 
267 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
268 {
269   PetscErrorCode ierr;
270   PetscInt       i;
271 
272   PetscFunctionBegin;
273   if (scall == MAT_INITIAL_MATRIX) {
274     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
275   }
276 
277   for (i=0; i<n; i++) {
278     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
279   }
280   PetscFunctionReturn(0);
281 }
282 
283 /* -------------------------------------------------------*/
284 /* Should check that shapes of vectors and matrices match */
285 /* -------------------------------------------------------*/
286 
287 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
288 {
289   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
290   PetscScalar       *z,x1,x2,zero=0.0;
291   const PetscScalar *x,*xb;
292   const MatScalar   *v;
293   PetscErrorCode    ierr;
294   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
295   const PetscInt    *aj=a->j,*ai=a->i,*ib;
296   PetscInt          nonzerorow=0;
297 
298   PetscFunctionBegin;
299   ierr = VecSet(zz,zero);CHKERRQ(ierr);
300   if (!a->nz) PetscFunctionReturn(0);
301   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
302   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
303 
304   v  = a->a;
305   xb = x;
306 
307   for (i=0; i<mbs; i++) {
308     n           = ai[1] - ai[0]; /* length of i_th block row of A */
309     x1          = xb[0]; x2 = xb[1];
310     ib          = aj + *ai;
311     jmin        = 0;
312     nonzerorow += (n>0);
313     if (*ib == i) {     /* (diag of A)*x */
314       z[2*i]   += v[0]*x1 + v[2]*x2;
315       z[2*i+1] += v[2]*x1 + v[3]*x2;
316       v        += 4; jmin++;
317     }
318     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
319     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
320     for (j=jmin; j<n; j++) {
321       /* (strict lower triangular part of A)*x  */
322       cval       = ib[j]*2;
323       z[cval]   += v[0]*x1 + v[1]*x2;
324       z[cval+1] += v[2]*x1 + v[3]*x2;
325       /* (strict upper triangular part of A)*x  */
326       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
327       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
328       v        += 4;
329     }
330     xb +=2; ai++;
331   }
332 
333   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
334   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
335   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
336   PetscFunctionReturn(0);
337 }
338 
339 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
340 {
341   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
342   PetscScalar       *z,x1,x2,x3,zero=0.0;
343   const PetscScalar *x,*xb;
344   const MatScalar   *v;
345   PetscErrorCode    ierr;
346   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
347   const PetscInt    *aj = a->j,*ai = a->i,*ib;
348   PetscInt          nonzerorow=0;
349 
350   PetscFunctionBegin;
351   ierr = VecSet(zz,zero);CHKERRQ(ierr);
352   if (!a->nz) PetscFunctionReturn(0);
353   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
354   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
355 
356   v  = a->a;
357   xb = x;
358 
359   for (i=0; i<mbs; i++) {
360     n           = ai[1] - ai[0]; /* length of i_th block row of A */
361     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
362     ib          = aj + *ai;
363     jmin        = 0;
364     nonzerorow += (n>0);
365     if (*ib == i) {     /* (diag of A)*x */
366       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
367       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
368       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
369       v        += 9; jmin++;
370     }
371     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
372     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
373     for (j=jmin; j<n; j++) {
374       /* (strict lower triangular part of A)*x  */
375       cval       = ib[j]*3;
376       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
377       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
378       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
379       /* (strict upper triangular part of A)*x  */
380       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
381       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
382       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
383       v        += 9;
384     }
385     xb +=3; ai++;
386   }
387 
388   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
389   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
390   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
391   PetscFunctionReturn(0);
392 }
393 
394 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
395 {
396   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
397   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
398   const PetscScalar *x,*xb;
399   const MatScalar   *v;
400   PetscErrorCode    ierr;
401   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
402   const PetscInt    *aj = a->j,*ai = a->i,*ib;
403   PetscInt          nonzerorow = 0;
404 
405   PetscFunctionBegin;
406   ierr = VecSet(zz,zero);CHKERRQ(ierr);
407   if (!a->nz) PetscFunctionReturn(0);
408   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
409   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
410 
411   v  = a->a;
412   xb = x;
413 
414   for (i=0; i<mbs; i++) {
415     n           = ai[1] - ai[0]; /* length of i_th block row of A */
416     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417     ib          = aj + *ai;
418     jmin        = 0;
419     nonzerorow += (n>0);
420     if (*ib == i) {     /* (diag of A)*x */
421       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
422       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
423       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
424       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
425       v        += 16; jmin++;
426     }
427     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
428     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
429     for (j=jmin; j<n; j++) {
430       /* (strict lower triangular part of A)*x  */
431       cval       = ib[j]*4;
432       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
433       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
434       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
435       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
436       /* (strict upper triangular part of A)*x  */
437       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
438       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
439       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
440       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
441       v        += 16;
442     }
443     xb +=4; ai++;
444   }
445 
446   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
447   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
448   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
449   PetscFunctionReturn(0);
450 }
451 
452 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
453 {
454   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
455   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
456   const PetscScalar *x,*xb;
457   const MatScalar   *v;
458   PetscErrorCode    ierr;
459   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
460   const PetscInt    *aj = a->j,*ai = a->i,*ib;
461   PetscInt          nonzerorow=0;
462 
463   PetscFunctionBegin;
464   ierr = VecSet(zz,zero);CHKERRQ(ierr);
465   if (!a->nz) PetscFunctionReturn(0);
466   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
467   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
468 
469   v  = a->a;
470   xb = x;
471 
472   for (i=0; i<mbs; i++) {
473     n           = ai[1] - ai[0]; /* length of i_th block row of A */
474     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
475     ib          = aj + *ai;
476     jmin        = 0;
477     nonzerorow += (n>0);
478     if (*ib == i) {      /* (diag of A)*x */
479       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
480       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
481       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
482       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
483       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
484       v        += 25; jmin++;
485     }
486     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
487     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
488     for (j=jmin; j<n; j++) {
489       /* (strict lower triangular part of A)*x  */
490       cval       = ib[j]*5;
491       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
492       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
493       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
494       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
495       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
496       /* (strict upper triangular part of A)*x  */
497       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
498       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
499       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
500       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
501       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
502       v        += 25;
503     }
504     xb +=5; ai++;
505   }
506 
507   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
508   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
509   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
510   PetscFunctionReturn(0);
511 }
512 
513 
514 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515 {
516   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518   const PetscScalar *x,*xb;
519   const MatScalar   *v;
520   PetscErrorCode    ierr;
521   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522   const PetscInt    *aj=a->j,*ai=a->i,*ib;
523   PetscInt          nonzerorow=0;
524 
525   PetscFunctionBegin;
526   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527   if (!a->nz) PetscFunctionReturn(0);
528   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
529   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
530 
531   v  = a->a;
532   xb = x;
533 
534   for (i=0; i<mbs; i++) {
535     n           = ai[1] - ai[0]; /* length of i_th block row of A */
536     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537     ib          = aj + *ai;
538     jmin        = 0;
539     nonzerorow += (n>0);
540     if (*ib == i) {      /* (diag of A)*x */
541       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547       v        += 36; jmin++;
548     }
549     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551     for (j=jmin; j<n; j++) {
552       /* (strict lower triangular part of A)*x  */
553       cval       = ib[j]*6;
554       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560       /* (strict upper triangular part of A)*x  */
561       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567       v        += 36;
568     }
569     xb +=6; ai++;
570   }
571 
572   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
575   PetscFunctionReturn(0);
576 }
577 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
578 {
579   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
580   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
581   const PetscScalar *x,*xb;
582   const MatScalar   *v;
583   PetscErrorCode    ierr;
584   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
585   const PetscInt    *aj=a->j,*ai=a->i,*ib;
586   PetscInt          nonzerorow=0;
587 
588   PetscFunctionBegin;
589   ierr = VecSet(zz,zero);CHKERRQ(ierr);
590   if (!a->nz) PetscFunctionReturn(0);
591   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
592   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
593 
594   v  = a->a;
595   xb = x;
596 
597   for (i=0; i<mbs; i++) {
598     n           = ai[1] - ai[0]; /* length of i_th block row of A */
599     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
600     ib          = aj + *ai;
601     jmin        = 0;
602     nonzerorow += (n>0);
603     if (*ib == i) {      /* (diag of A)*x */
604       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
605       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
606       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
607       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
608       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
609       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
610       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
611       v        += 49; jmin++;
612     }
613     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
614     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
615     for (j=jmin; j<n; j++) {
616       /* (strict lower triangular part of A)*x  */
617       cval       = ib[j]*7;
618       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
619       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
620       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
621       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
622       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
623       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
624       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
625       /* (strict upper triangular part of A)*x  */
626       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
627       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
628       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
629       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
630       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
631       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
632       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
633       v       += 49;
634     }
635     xb +=7; ai++;
636   }
637   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
638   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
639   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
640   PetscFunctionReturn(0);
641 }
642 
643 /*
644     This will not work with MatScalar == float because it calls the BLAS
645 */
646 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
647 {
648   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
649   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
650   const PetscScalar *x,*x_ptr,*xb;
651   const MatScalar   *v;
652   PetscErrorCode    ierr;
653   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
654   const PetscInt    *idx,*aj,*ii;
655   PetscInt          nonzerorow=0;
656 
657   PetscFunctionBegin;
658   ierr = VecSet(zz,zero);CHKERRQ(ierr);
659   if (!a->nz) PetscFunctionReturn(0);
660   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
661   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
662 
663   aj = a->j;
664   v  = a->a;
665   ii = a->i;
666 
667   if (!a->mult_work) {
668     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
669   }
670   work = a->mult_work;
671 
672   for (i=0; i<mbs; i++) {
673     n           = ii[1] - ii[0]; ncols = n*bs;
674     workt       = work; idx=aj+ii[0];
675     nonzerorow += (n>0);
676 
677     /* upper triangular part */
678     for (j=0; j<n; j++) {
679       xb = x_ptr + bs*(*idx++);
680       for (k=0; k<bs; k++) workt[k] = xb[k];
681       workt += bs;
682     }
683     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
684     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
685 
686     /* strict lower triangular part */
687     idx = aj+ii[0];
688     if (*idx == i) {
689       ncols -= bs; v += bs2; idx++; n--;
690     }
691 
692     if (ncols > 0) {
693       workt = work;
694       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
695       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
696       for (j=0; j<n; j++) {
697         zb = z_ptr + bs*(*idx++);
698         for (k=0; k<bs; k++) zb[k] += workt[k];
699         workt += bs;
700       }
701     }
702     x += bs; v += n*bs2; z += bs; ii++;
703   }
704 
705   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
706   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
707   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
708   PetscFunctionReturn(0);
709 }
710 
711 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
712 {
713   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
714   PetscScalar       *z,x1;
715   const PetscScalar *x,*xb;
716   const MatScalar   *v;
717   PetscErrorCode    ierr;
718   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
719   const PetscInt    *aj=a->j,*ai=a->i,*ib;
720   PetscInt          nonzerorow=0;
721 
722   PetscFunctionBegin;
723   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
724   if (!a->nz) PetscFunctionReturn(0);
725   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
726   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
727   v    = a->a;
728   xb   = x;
729 
730   for (i=0; i<mbs; i++) {
731     n           = ai[1] - ai[0]; /* length of i_th row of A */
732     x1          = xb[0];
733     ib          = aj + *ai;
734     jmin        = 0;
735     nonzerorow += (n>0);
736     if (*ib == i) {            /* (diag of A)*x */
737       z[i] += *v++ * x[*ib++]; jmin++;
738     }
739     for (j=jmin; j<n; j++) {
740       cval    = *ib;
741       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
742       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
743     }
744     xb++; ai++;
745   }
746 
747   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
748   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
749 
750   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
751   PetscFunctionReturn(0);
752 }
753 
754 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
755 {
756   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
757   PetscScalar       *z,x1,x2;
758   const PetscScalar *x,*xb;
759   const MatScalar   *v;
760   PetscErrorCode    ierr;
761   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
762   const PetscInt    *aj=a->j,*ai=a->i,*ib;
763   PetscInt          nonzerorow=0;
764 
765   PetscFunctionBegin;
766   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
767   if (!a->nz) PetscFunctionReturn(0);
768   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
769   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
770 
771   v  = a->a;
772   xb = x;
773 
774   for (i=0; i<mbs; i++) {
775     n           = ai[1] - ai[0]; /* length of i_th block row of A */
776     x1          = xb[0]; x2 = xb[1];
777     ib          = aj + *ai;
778     jmin        = 0;
779     nonzerorow += (n>0);
780     if (*ib == i) {      /* (diag of A)*x */
781       z[2*i]   += v[0]*x1 + v[2]*x2;
782       z[2*i+1] += v[2]*x1 + v[3]*x2;
783       v        += 4; jmin++;
784     }
785     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
786     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
787     for (j=jmin; j<n; j++) {
788       /* (strict lower triangular part of A)*x  */
789       cval       = ib[j]*2;
790       z[cval]   += v[0]*x1 + v[1]*x2;
791       z[cval+1] += v[2]*x1 + v[3]*x2;
792       /* (strict upper triangular part of A)*x  */
793       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
794       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
795       v        += 4;
796     }
797     xb +=2; ai++;
798   }
799   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
800   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
801 
802   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
803   PetscFunctionReturn(0);
804 }
805 
806 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
807 {
808   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
809   PetscScalar       *z,x1,x2,x3;
810   const PetscScalar *x,*xb;
811   const MatScalar   *v;
812   PetscErrorCode    ierr;
813   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
814   const PetscInt    *aj=a->j,*ai=a->i,*ib;
815   PetscInt          nonzerorow=0;
816 
817   PetscFunctionBegin;
818   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
819   if (!a->nz) PetscFunctionReturn(0);
820   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
821   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
822 
823   v  = a->a;
824   xb = x;
825 
826   for (i=0; i<mbs; i++) {
827     n           = ai[1] - ai[0]; /* length of i_th block row of A */
828     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
829     ib          = aj + *ai;
830     jmin        = 0;
831     nonzerorow += (n>0);
832     if (*ib == i) {     /* (diag of A)*x */
833       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
834       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
835       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
836       v        += 9; jmin++;
837     }
838     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
839     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
840     for (j=jmin; j<n; j++) {
841       /* (strict lower triangular part of A)*x  */
842       cval       = ib[j]*3;
843       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
844       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
845       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
846       /* (strict upper triangular part of A)*x  */
847       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
848       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
849       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
850       v        += 9;
851     }
852     xb +=3; ai++;
853   }
854 
855   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
856   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
857 
858   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
859   PetscFunctionReturn(0);
860 }
861 
862 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
863 {
864   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
865   PetscScalar       *z,x1,x2,x3,x4;
866   const PetscScalar *x,*xb;
867   const MatScalar   *v;
868   PetscErrorCode    ierr;
869   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
870   const PetscInt    *aj=a->j,*ai=a->i,*ib;
871   PetscInt          nonzerorow=0;
872 
873   PetscFunctionBegin;
874   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
875   if (!a->nz) PetscFunctionReturn(0);
876   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
877   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
878 
879   v  = a->a;
880   xb = x;
881 
882   for (i=0; i<mbs; i++) {
883     n           = ai[1] - ai[0]; /* length of i_th block row of A */
884     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
885     ib          = aj + *ai;
886     jmin        = 0;
887     nonzerorow += (n>0);
888     if (*ib == i) {      /* (diag of A)*x */
889       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
890       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
891       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
892       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
893       v        += 16; jmin++;
894     }
895     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
896     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
897     for (j=jmin; j<n; j++) {
898       /* (strict lower triangular part of A)*x  */
899       cval       = ib[j]*4;
900       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
901       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
902       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
903       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
904       /* (strict upper triangular part of A)*x  */
905       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
906       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
907       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
908       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
909       v        += 16;
910     }
911     xb +=4; ai++;
912   }
913 
914   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
915   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
916 
917   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
918   PetscFunctionReturn(0);
919 }
920 
921 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
922 {
923   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
924   PetscScalar       *z,x1,x2,x3,x4,x5;
925   const PetscScalar *x,*xb;
926   const MatScalar   *v;
927   PetscErrorCode    ierr;
928   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
929   const PetscInt    *aj=a->j,*ai=a->i,*ib;
930   PetscInt          nonzerorow=0;
931 
932   PetscFunctionBegin;
933   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
934   if (!a->nz) PetscFunctionReturn(0);
935   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
936   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
937 
938   v  = a->a;
939   xb = x;
940 
941   for (i=0; i<mbs; i++) {
942     n           = ai[1] - ai[0]; /* length of i_th block row of A */
943     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
944     ib          = aj + *ai;
945     jmin        = 0;
946     nonzerorow += (n>0);
947     if (*ib == i) {      /* (diag of A)*x */
948       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
949       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
950       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
951       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
952       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
953       v        += 25; jmin++;
954     }
955     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
956     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
957     for (j=jmin; j<n; j++) {
958       /* (strict lower triangular part of A)*x  */
959       cval       = ib[j]*5;
960       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
961       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
962       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
963       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
964       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
965       /* (strict upper triangular part of A)*x  */
966       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
967       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
968       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
969       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
970       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
971       v        += 25;
972     }
973     xb +=5; ai++;
974   }
975 
976   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
977   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
978 
979   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
980   PetscFunctionReturn(0);
981 }
982 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
983 {
984   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
985   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
986   const PetscScalar *x,*xb;
987   const MatScalar   *v;
988   PetscErrorCode    ierr;
989   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
990   const PetscInt    *aj=a->j,*ai=a->i,*ib;
991   PetscInt          nonzerorow=0;
992 
993   PetscFunctionBegin;
994   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
995   if (!a->nz) PetscFunctionReturn(0);
996   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
997   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
998 
999   v  = a->a;
1000   xb = x;
1001 
1002   for (i=0; i<mbs; i++) {
1003     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1004     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1005     ib          = aj + *ai;
1006     jmin        = 0;
1007     nonzerorow += (n>0);
1008     if (*ib == i) {     /* (diag of A)*x */
1009       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1010       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1011       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1012       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1013       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1014       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1015       v        += 36; jmin++;
1016     }
1017     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1018     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019     for (j=jmin; j<n; j++) {
1020       /* (strict lower triangular part of A)*x  */
1021       cval       = ib[j]*6;
1022       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1023       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1024       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1025       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1026       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1027       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1028       /* (strict upper triangular part of A)*x  */
1029       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1030       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1031       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1032       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1033       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1034       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1035       v        += 36;
1036     }
1037     xb +=6; ai++;
1038   }
1039 
1040   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1041   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1042 
1043   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1044   PetscFunctionReturn(0);
1045 }
1046 
1047 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1048 {
1049   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1050   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1051   const PetscScalar *x,*xb;
1052   const MatScalar   *v;
1053   PetscErrorCode    ierr;
1054   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1055   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1056   PetscInt          nonzerorow=0;
1057 
1058   PetscFunctionBegin;
1059   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1060   if (!a->nz) PetscFunctionReturn(0);
1061   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1062   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1063 
1064   v  = a->a;
1065   xb = x;
1066 
1067   for (i=0; i<mbs; i++) {
1068     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1069     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1070     ib          = aj + *ai;
1071     jmin        = 0;
1072     nonzerorow += (n>0);
1073     if (*ib == i) {     /* (diag of A)*x */
1074       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1075       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1076       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1077       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1078       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1079       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1080       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1081       v        += 49; jmin++;
1082     }
1083     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1084     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1085     for (j=jmin; j<n; j++) {
1086       /* (strict lower triangular part of A)*x  */
1087       cval       = ib[j]*7;
1088       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1089       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1090       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1091       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1092       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1093       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1094       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1095       /* (strict upper triangular part of A)*x  */
1096       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1097       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1098       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1099       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1100       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1101       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1102       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1103       v       += 49;
1104     }
1105     xb +=7; ai++;
1106   }
1107 
1108   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1109   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1110 
1111   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1116 {
1117   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1118   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1119   const PetscScalar *x,*x_ptr,*xb;
1120   const MatScalar   *v;
1121   PetscErrorCode    ierr;
1122   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1123   const PetscInt    *idx,*aj,*ii;
1124   PetscInt          nonzerorow=0;
1125 
1126   PetscFunctionBegin;
1127   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1128   if (!a->nz) PetscFunctionReturn(0);
1129   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1130   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1131 
1132   aj = a->j;
1133   v  = a->a;
1134   ii = a->i;
1135 
1136   if (!a->mult_work) {
1137     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1138   }
1139   work = a->mult_work;
1140 
1141 
1142   for (i=0; i<mbs; i++) {
1143     n           = ii[1] - ii[0]; ncols = n*bs;
1144     workt       = work; idx=aj+ii[0];
1145     nonzerorow += (n>0);
1146 
1147     /* upper triangular part */
1148     for (j=0; j<n; j++) {
1149       xb = x_ptr + bs*(*idx++);
1150       for (k=0; k<bs; k++) workt[k] = xb[k];
1151       workt += bs;
1152     }
1153     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1154     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1155 
1156     /* strict lower triangular part */
1157     idx = aj+ii[0];
1158     if (*idx == i) {
1159       ncols -= bs; v += bs2; idx++; n--;
1160     }
1161     if (ncols > 0) {
1162       workt = work;
1163       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
1164       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1165       for (j=0; j<n; j++) {
1166         zb = z_ptr + bs*(*idx++);
1167         for (k=0; k<bs; k++) zb[k] += workt[k];
1168         workt += bs;
1169       }
1170     }
1171 
1172     x += bs; v += n*bs2; z += bs; ii++;
1173   }
1174 
1175   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1177 
1178   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1179   PetscFunctionReturn(0);
1180 }
1181 
1182 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1183 {
1184   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1185   PetscScalar    oalpha = alpha;
1186   PetscErrorCode ierr;
1187   PetscBLASInt   one = 1,totalnz;
1188 
1189   PetscFunctionBegin;
1190   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1191   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1192   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1193   PetscFunctionReturn(0);
1194 }
1195 
1196 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1197 {
1198   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1199   const MatScalar *v       = a->a;
1200   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1201   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1202   PetscErrorCode  ierr;
1203   const PetscInt  *aj=a->j,*col;
1204 
1205   PetscFunctionBegin;
1206   if (!a->nz) {
1207     *norm = 0.0;
1208     PetscFunctionReturn(0);
1209   }
1210   if (type == NORM_FROBENIUS) {
1211     for (k=0; k<mbs; k++) {
1212       jmin = a->i[k]; jmax = a->i[k+1];
1213       col  = aj + jmin;
1214       if (*col == k) {         /* diagonal block */
1215         for (i=0; i<bs2; i++) {
1216           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1217         }
1218         jmin++;
1219       }
1220       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1221         for (i=0; i<bs2; i++) {
1222           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1223         }
1224       }
1225     }
1226     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1227     ierr = PetscLogFlops(2*bs2*a->nz);CHKERRQ(ierr);
1228   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1229     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1230     for (i=0; i<mbs; i++) jl[i] = mbs;
1231     il[0] = 0;
1232 
1233     *norm = 0.0;
1234     for (k=0; k<mbs; k++) { /* k_th block row */
1235       for (j=0; j<bs; j++) sum[j]=0.0;
1236       /*-- col sum --*/
1237       i = jl[k]; /* first |A(i,k)| to be added */
1238       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1239                   at step k */
1240       while (i<mbs) {
1241         nexti = jl[i];  /* next block row to be added */
1242         ik    = il[i];  /* block index of A(i,k) in the array a */
1243         for (j=0; j<bs; j++) {
1244           v = a->a + ik*bs2 + j*bs;
1245           for (k1=0; k1<bs; k1++) {
1246             sum[j] += PetscAbsScalar(*v); v++;
1247           }
1248         }
1249         /* update il, jl */
1250         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1251         jmax = a->i[i+1];
1252         if (jmin < jmax) {
1253           il[i] = jmin;
1254           j     = a->j[jmin];
1255           jl[i] = jl[j]; jl[j]=i;
1256         }
1257         i = nexti;
1258       }
1259       /*-- row sum --*/
1260       jmin = a->i[k]; jmax = a->i[k+1];
1261       for (i=jmin; i<jmax; i++) {
1262         for (j=0; j<bs; j++) {
1263           v = a->a + i*bs2 + j;
1264           for (k1=0; k1<bs; k1++) {
1265             sum[j] += PetscAbsScalar(*v); v += bs;
1266           }
1267         }
1268       }
1269       /* add k_th block row to il, jl */
1270       col = aj+jmin;
1271       if (*col == k) jmin++;
1272       if (jmin < jmax) {
1273         il[k] = jmin;
1274         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1275       }
1276       for (j=0; j<bs; j++) {
1277         if (sum[j] > *norm) *norm = sum[j];
1278       }
1279     }
1280     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1281     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1282   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1283   PetscFunctionReturn(0);
1284 }
1285 
1286 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1287 {
1288   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1289   PetscErrorCode ierr;
1290 
1291   PetscFunctionBegin;
1292   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1293   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1294     *flg = PETSC_FALSE;
1295     PetscFunctionReturn(0);
1296   }
1297 
1298   /* if the a->i are the same */
1299   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
1300   if (!*flg) PetscFunctionReturn(0);
1301 
1302   /* if a->j are the same */
1303   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
1304   if (!*flg) PetscFunctionReturn(0);
1305 
1306   /* if a->a are the same */
1307   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1308   PetscFunctionReturn(0);
1309 }
1310 
1311 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1312 {
1313   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1314   PetscErrorCode  ierr;
1315   PetscInt        i,j,k,row,bs,ambs,bs2;
1316   const PetscInt  *ai,*aj;
1317   PetscScalar     *x,zero = 0.0;
1318   const MatScalar *aa,*aa_j;
1319 
1320   PetscFunctionBegin;
1321   bs = A->rmap->bs;
1322   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1323 
1324   aa   = a->a;
1325   ambs = a->mbs;
1326 
1327   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1328     PetscInt *diag=a->diag;
1329     aa   = a->a;
1330     ambs = a->mbs;
1331     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1332     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1333     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1334     PetscFunctionReturn(0);
1335   }
1336 
1337   ai   = a->i;
1338   aj   = a->j;
1339   bs2  = a->bs2;
1340   ierr = VecSet(v,zero);CHKERRQ(ierr);
1341   if (!a->nz) PetscFunctionReturn(0);
1342   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1343   for (i=0; i<ambs; i++) {
1344     j=ai[i];
1345     if (aj[j] == i) {    /* if this is a diagonal element */
1346       row  = i*bs;
1347       aa_j = aa + j*bs2;
1348       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1349     }
1350   }
1351   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1352   PetscFunctionReturn(0);
1353 }
1354 
1355 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1356 {
1357   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1358   PetscScalar       x;
1359   const PetscScalar *l,*li,*ri;
1360   MatScalar         *aa,*v;
1361   PetscErrorCode    ierr;
1362   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1363   const PetscInt    *ai,*aj;
1364   PetscBool         flg;
1365 
1366   PetscFunctionBegin;
1367   if (ll != rr) {
1368     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1369     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1370   }
1371   if (!ll) PetscFunctionReturn(0);
1372   ai  = a->i;
1373   aj  = a->j;
1374   aa  = a->a;
1375   m   = A->rmap->N;
1376   bs  = A->rmap->bs;
1377   mbs = a->mbs;
1378   bs2 = a->bs2;
1379 
1380   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1381   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1382   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1383   for (i=0; i<mbs; i++) { /* for each block row */
1384     M  = ai[i+1] - ai[i];
1385     li = l + i*bs;
1386     v  = aa + bs2*ai[i];
1387     for (j=0; j<M; j++) { /* for each block */
1388       ri = l + bs*aj[ai[i]+j];
1389       for (k=0; k<bs; k++) {
1390         x = ri[k];
1391         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1392       }
1393     }
1394   }
1395   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1396   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1397   PetscFunctionReturn(0);
1398 }
1399 
1400 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1401 {
1402   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1403 
1404   PetscFunctionBegin;
1405   info->block_size   = a->bs2;
1406   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1407   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1408   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1409   info->assemblies   = A->num_ass;
1410   info->mallocs      = A->info.mallocs;
1411   info->memory       = ((PetscObject)A)->mem;
1412   if (A->factortype) {
1413     info->fill_ratio_given  = A->info.fill_ratio_given;
1414     info->fill_ratio_needed = A->info.fill_ratio_needed;
1415     info->factor_mallocs    = A->info.factor_mallocs;
1416   } else {
1417     info->fill_ratio_given  = 0;
1418     info->fill_ratio_needed = 0;
1419     info->factor_mallocs    = 0;
1420   }
1421   PetscFunctionReturn(0);
1422 }
1423 
1424 
1425 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1426 {
1427   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1428   PetscErrorCode ierr;
1429 
1430   PetscFunctionBegin;
1431   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 /*
1436    This code does not work since it only checks the upper triangular part of
1437   the matrix. Hence it is not listed in the function table.
1438 */
1439 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1440 {
1441   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1442   PetscErrorCode  ierr;
1443   PetscInt        i,j,n,row,col,bs,mbs;
1444   const PetscInt  *ai,*aj;
1445   PetscReal       atmp;
1446   const MatScalar *aa;
1447   PetscScalar     *x;
1448   PetscInt        ncols,brow,bcol,krow,kcol;
1449 
1450   PetscFunctionBegin;
1451   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1452   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1453   bs  = A->rmap->bs;
1454   aa  = a->a;
1455   ai  = a->i;
1456   aj  = a->j;
1457   mbs = a->mbs;
1458 
1459   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1460   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1461   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1462   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1463   for (i=0; i<mbs; i++) {
1464     ncols = ai[1] - ai[0]; ai++;
1465     brow  = bs*i;
1466     for (j=0; j<ncols; j++) {
1467       bcol = bs*(*aj);
1468       for (kcol=0; kcol<bs; kcol++) {
1469         col = bcol + kcol;      /* col index */
1470         for (krow=0; krow<bs; krow++) {
1471           atmp = PetscAbsScalar(*aa); aa++;
1472           row  = brow + krow;   /* row index */
1473           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1474           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1475         }
1476       }
1477       aj++;
1478     }
1479   }
1480   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1481   PetscFunctionReturn(0);
1482 }
1483