xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 2205254efee3a00a594e5e2a3a70f74dcb40bc03)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0)  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs = a->mbs;
21   ai  = a->i;
22   aj  = a->j;
23   bs  = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n ; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end ; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end ; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) {nidx[isz++] = brow;}
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++)
85         nidx2[j*bs+k] = nidx[j]*bs+k;
86     }
87     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
88   }
89   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
90   ierr = PetscFree(nidx);CHKERRQ(ierr);
91   ierr = PetscFree(nidx2);CHKERRQ(ierr);
92   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
93   PetscFunctionReturn(0);
94 }
95 
96 #undef __FUNCT__
97 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
98 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
99 {
100   Mat_SeqSBAIJ  *a = (Mat_SeqSBAIJ*)A->data,*c;
101   PetscErrorCode ierr;
102   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
103   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
104   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
105   const PetscInt *irow,*aj = a->j,*ai = a->i;
106   MatScalar      *mat_a;
107   Mat            C;
108   PetscBool      flag,sorted;
109 
110   PetscFunctionBegin;
111   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
112   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
113   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
114 
115   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
116   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
117 
118   ierr  = PetscMalloc(oldcols*sizeof(PetscInt),&smap);CHKERRQ(ierr);
119   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
120   ssmap = smap;
121   ierr  = PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);CHKERRQ(ierr);
122   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
123   /* determine lens of each row */
124   for (i=0; i<nrows; i++) {
125     kstart  = ai[irow[i]];
126     kend    = kstart + a->ilen[irow[i]];
127     lens[i] = 0;
128       for (k=kstart; k<kend; k++) {
129         if (ssmap[aj[k]]) {
130           lens[i]++;
131         }
132       }
133     }
134   /* Create and fill new matrix */
135   if (scall == MAT_REUSE_MATRIX) {
136     c = (Mat_SeqSBAIJ *)((*B)->data);
137 
138     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
139     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
140     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
141     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
142     C = *B;
143   } else {
144     ierr = MatCreate(((PetscObject)A)->comm,&C);CHKERRQ(ierr);
145     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
146     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
147     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
148   }
149   c = (Mat_SeqSBAIJ *)(C->data);
150   for (i=0; i<nrows; i++) {
151     row    = irow[i];
152     kstart = ai[row];
153     kend   = kstart + a->ilen[row];
154     mat_i  = c->i[i];
155     mat_j  = c->j + mat_i;
156     mat_a  = c->a + mat_i*bs2;
157     mat_ilen = c->ilen + i;
158     for (k=kstart; k<kend; k++) {
159       if ((tcol=ssmap[a->j[k]])) {
160         *mat_j++ = tcol - 1;
161         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
162         mat_a   += bs2;
163         (*mat_ilen)++;
164       }
165     }
166   }
167 
168   /* Free work space */
169   ierr = PetscFree(smap);CHKERRQ(ierr);
170   ierr = PetscFree(lens);CHKERRQ(ierr);
171   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
172   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
173 
174   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
175   *B = C;
176   PetscFunctionReturn(0);
177 }
178 
179 #undef __FUNCT__
180 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
181 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
182 {
183   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
184   IS             is1;
185   PetscErrorCode ierr;
186   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
187   const PetscInt *irow;
188 
189   PetscFunctionBegin;
190   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
191 
192   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
193   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
194 
195   /* Verify if the indices corespond to each element in a block
196    and form the IS with compressed IS */
197   ierr = PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);CHKERRQ(ierr);
198   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
199   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200 
201   count = 0;
202   for (i=0; i<a->mbs; i++) {
203     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204     if (vary[i]==bs) iary[count++] = i;
205   }
206   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
207   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
208   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
209 
210   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
211   ierr = ISDestroy(&is1);CHKERRQ(ierr);
212   PetscFunctionReturn(0);
213 }
214 
215 #undef __FUNCT__
216 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
217 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218 {
219   PetscErrorCode ierr;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (scall == MAT_INITIAL_MATRIX) {
224     ierr = PetscMalloc((n+1)*sizeof(Mat),B);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<n; i++) {
228     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
229   }
230   PetscFunctionReturn(0);
231 }
232 
233 /* -------------------------------------------------------*/
234 /* Should check that shapes of vectors and matrices match */
235 /* -------------------------------------------------------*/
236 
237 #undef __FUNCT__
238 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
239 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
240 {
241   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
242   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
243   MatScalar      *v;
244   PetscErrorCode ierr;
245   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
246   PetscInt       nonzerorow=0;
247 
248   PetscFunctionBegin;
249   ierr = VecSet(zz,zero);CHKERRQ(ierr);
250   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
252 
253   v     = a->a;
254   xb = x;
255 
256   for (i=0; i<mbs; i++) {
257     n  = ai[1] - ai[0]; /* length of i_th block row of A */
258     x1 = xb[0]; x2 = xb[1];
259     ib = aj + *ai;
260     jmin = 0;
261     nonzerorow += (n>0);
262     if (*ib == i) {     /* (diag of A)*x */
263       z[2*i]   += v[0]*x1 + v[2]*x2;
264       z[2*i+1] += v[2]*x1 + v[3]*x2;
265       v += 4; jmin++;
266     }
267     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
268     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
269     for (j=jmin; j<n; j++) {
270       /* (strict lower triangular part of A)*x  */
271       cval       = ib[j]*2;
272       z[cval]     += v[0]*x1 + v[1]*x2;
273       z[cval+1]   += v[2]*x1 + v[3]*x2;
274       /* (strict upper triangular part of A)*x  */
275       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
276       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
277       v  += 4;
278     }
279     xb +=2; ai++;
280   }
281 
282   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
283   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
284   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
285   PetscFunctionReturn(0);
286 }
287 
288 #undef __FUNCT__
289 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
290 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
291 {
292   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
293   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
294   MatScalar      *v;
295   PetscErrorCode ierr;
296   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
297   PetscInt       nonzerorow=0;
298 
299   PetscFunctionBegin;
300   ierr = VecSet(zz,zero);CHKERRQ(ierr);
301   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
302   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
303 
304   v    = a->a;
305   xb   = x;
306 
307   for (i=0; i<mbs; i++) {
308     n  = ai[1] - ai[0]; /* length of i_th block row of A */
309     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
310     ib = aj + *ai;
311     jmin = 0;
312     nonzerorow += (n>0);
313     if (*ib == i) {     /* (diag of A)*x */
314       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
315       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
316       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
317       v += 9; jmin++;
318     }
319     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321     for (j=jmin; j<n; j++) {
322       /* (strict lower triangular part of A)*x  */
323       cval       = ib[j]*3;
324       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
325       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
326       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
327       /* (strict upper triangular part of A)*x  */
328       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
329       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
330       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
331       v  += 9;
332     }
333     xb +=3; ai++;
334   }
335 
336   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
337   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
338   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
339   PetscFunctionReturn(0);
340 }
341 
342 #undef __FUNCT__
343 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
344 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
345 {
346   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
347   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
348   MatScalar      *v;
349   PetscErrorCode ierr;
350   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
351   PetscInt       nonzerorow=0;
352 
353   PetscFunctionBegin;
354   ierr = VecSet(zz,zero);CHKERRQ(ierr);
355   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
356   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
357 
358   v     = a->a;
359   xb = x;
360 
361   for (i=0; i<mbs; i++) {
362     n  = ai[1] - ai[0]; /* length of i_th block row of A */
363     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
364     ib = aj + *ai;
365     jmin = 0;
366     nonzerorow += (n>0);
367     if (*ib == i) {     /* (diag of A)*x */
368       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
369       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
370       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
371       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
372       v += 16; jmin++;
373     }
374     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
375     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
376     for (j=jmin; j<n; j++) {
377       /* (strict lower triangular part of A)*x  */
378       cval       = ib[j]*4;
379       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
380       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
381       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
382       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
383       /* (strict upper triangular part of A)*x  */
384       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
385       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
386       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
387       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
388       v  += 16;
389     }
390     xb +=4; ai++;
391   }
392 
393   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
394   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
395   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
396   PetscFunctionReturn(0);
397 }
398 
399 #undef __FUNCT__
400 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
401 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
402 {
403   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
404   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
405   MatScalar      *v;
406   PetscErrorCode ierr;
407   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
408   PetscInt       nonzerorow=0;
409 
410   PetscFunctionBegin;
411   ierr = VecSet(zz,zero);CHKERRQ(ierr);
412   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
413   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
414 
415   v     = a->a;
416   xb = x;
417 
418   for (i=0; i<mbs; i++) {
419     n  = ai[1] - ai[0]; /* length of i_th block row of A */
420     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
421     ib = aj + *ai;
422     jmin = 0;
423     nonzerorow += (n>0);
424     if (*ib == i) {      /* (diag of A)*x */
425       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
426       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
427       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
428       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
429       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
430       v += 25; jmin++;
431     }
432     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
433     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
434     for (j=jmin; j<n; j++) {
435       /* (strict lower triangular part of A)*x  */
436       cval       = ib[j]*5;
437       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
438       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
439       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
440       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
441       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
442       /* (strict upper triangular part of A)*x  */
443       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
444       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
445       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
446       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
447       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
448       v  += 25;
449     }
450     xb +=5; ai++;
451   }
452 
453   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
454   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
455   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
456   PetscFunctionReturn(0);
457 }
458 
459 
460 #undef __FUNCT__
461 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
462 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
463 {
464   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
465   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
466   MatScalar      *v;
467   PetscErrorCode ierr;
468   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
469   PetscInt       nonzerorow=0;
470 
471   PetscFunctionBegin;
472   ierr = VecSet(zz,zero);CHKERRQ(ierr);
473   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
475 
476   v     = a->a;
477   xb = x;
478 
479   for (i=0; i<mbs; i++) {
480     n  = ai[1] - ai[0]; /* length of i_th block row of A */
481     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
482     ib = aj + *ai;
483     jmin = 0;
484     nonzerorow += (n>0);
485     if (*ib == i) {      /* (diag of A)*x */
486       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
487       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
488       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
489       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
490       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
491       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
492       v += 36; jmin++;
493     }
494     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
495     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
496     for (j=jmin; j<n; j++) {
497       /* (strict lower triangular part of A)*x  */
498       cval       = ib[j]*6;
499       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
500       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
501       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
502       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
503       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
504       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
505       /* (strict upper triangular part of A)*x  */
506       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
507       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
508       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
509       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
510       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
511       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
512       v  += 36;
513     }
514     xb +=6; ai++;
515   }
516 
517   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
518   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
519   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
520   PetscFunctionReturn(0);
521 }
522 #undef __FUNCT__
523 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
524 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
525 {
526   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
527   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
528   MatScalar      *v;
529   PetscErrorCode ierr;
530   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
531   PetscInt       nonzerorow=0;
532 
533   PetscFunctionBegin;
534   ierr = VecSet(zz,zero);CHKERRQ(ierr);
535   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
536   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
537 
538   v     = a->a;
539   xb = x;
540 
541   for (i=0; i<mbs; i++) {
542     n  = ai[1] - ai[0]; /* length of i_th block row of A */
543     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
544     ib = aj + *ai;
545     jmin = 0;
546     nonzerorow += (n>0);
547     if (*ib == i) {      /* (diag of A)*x */
548       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
549       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
550       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
551       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
552       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
553       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
554       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
555       v += 49; jmin++;
556     }
557     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
558     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
559     for (j=jmin; j<n; j++) {
560       /* (strict lower triangular part of A)*x  */
561       cval       = ib[j]*7;
562       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
563       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
564       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
565       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
566       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
567       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
568       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
569       /* (strict upper triangular part of A)*x  */
570       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
571       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
572       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
573       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
574       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
575       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
576       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
577       v  += 49;
578     }
579     xb +=7; ai++;
580   }
581   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
582   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
583   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
584   PetscFunctionReturn(0);
585 }
586 
587 /*
588     This will not work with MatScalar == float because it calls the BLAS
589 */
590 #undef __FUNCT__
591 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
592 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
593 {
594   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
595   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
596   MatScalar      *v;
597   PetscErrorCode ierr;
598   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
599   PetscInt       nonzerorow=0;
600 
601   PetscFunctionBegin;
602   ierr = VecSet(zz,zero);CHKERRQ(ierr);
603   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
604   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
605 
606   aj   = a->j;
607   v    = a->a;
608   ii   = a->i;
609 
610   if (!a->mult_work) {
611     ierr = PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
612   }
613   work = a->mult_work;
614 
615   for (i=0; i<mbs; i++) {
616     n     = ii[1] - ii[0]; ncols = n*bs;
617     workt = work; idx=aj+ii[0];
618     nonzerorow += (n>0);
619 
620     /* upper triangular part */
621     for (j=0; j<n; j++) {
622       xb = x_ptr + bs*(*idx++);
623       for (k=0; k<bs; k++) workt[k] = xb[k];
624       workt += bs;
625     }
626     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
627     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
628 
629     /* strict lower triangular part */
630     idx = aj+ii[0];
631     if (*idx == i) {
632       ncols -= bs; v += bs2; idx++; n--;
633     }
634 
635     if (ncols > 0) {
636       workt = work;
637       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
638       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
639       for (j=0; j<n; j++) {
640         zb = z_ptr + bs*(*idx++);
641         for (k=0; k<bs; k++) zb[k] += workt[k] ;
642         workt += bs;
643       }
644     }
645     x += bs; v += n*bs2; z += bs; ii++;
646   }
647 
648   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
649   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
650   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
651   PetscFunctionReturn(0);
652 }
653 
654 #undef __FUNCT__
655 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
656 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
657 {
658   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
659   PetscScalar    *x,*z,*xb,x1;
660   MatScalar      *v;
661   PetscErrorCode ierr;
662   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
663   PetscInt       nonzerorow=0;
664 
665   PetscFunctionBegin;
666   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
667   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
668   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
669   v  = a->a;
670   xb = x;
671 
672   for (i=0; i<mbs; i++) {
673     n  = ai[1] - ai[0];  /* length of i_th row of A */
674     x1 = xb[0];
675     ib = aj + *ai;
676     jmin = 0;
677     nonzerorow += (n>0);
678     if (*ib == i) {            /* (diag of A)*x */
679       z[i] += *v++ * x[*ib++]; jmin++;
680     }
681     for (j=jmin; j<n; j++) {
682       cval    = *ib;
683       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
684       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
685     }
686     xb++; ai++;
687   }
688 
689   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
690   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
691 
692   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
693   PetscFunctionReturn(0);
694 }
695 
696 #undef __FUNCT__
697 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
698 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
699 {
700   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
701   PetscScalar    *x,*z,*xb,x1,x2;
702   MatScalar      *v;
703   PetscErrorCode ierr;
704   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
705   PetscInt       nonzerorow=0;
706 
707   PetscFunctionBegin;
708   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
709   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
710   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
711 
712   v  = a->a;
713   xb = x;
714 
715   for (i=0; i<mbs; i++) {
716     n  = ai[1] - ai[0]; /* length of i_th block row of A */
717     x1 = xb[0]; x2 = xb[1];
718     ib = aj + *ai;
719     jmin = 0;
720     nonzerorow += (n>0);
721     if (*ib == i) {      /* (diag of A)*x */
722       z[2*i]   += v[0]*x1 + v[2]*x2;
723       z[2*i+1] += v[2]*x1 + v[3]*x2;
724       v += 4; jmin++;
725     }
726     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
727     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
728     for (j=jmin; j<n; j++) {
729       /* (strict lower triangular part of A)*x  */
730       cval       = ib[j]*2;
731       z[cval]     += v[0]*x1 + v[1]*x2;
732       z[cval+1]   += v[2]*x1 + v[3]*x2;
733       /* (strict upper triangular part of A)*x  */
734       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
735       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
736       v  += 4;
737     }
738     xb +=2; ai++;
739   }
740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
741   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
742 
743   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
744   PetscFunctionReturn(0);
745 }
746 
747 #undef __FUNCT__
748 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
749 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
750 {
751   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
752   PetscScalar    *x,*z,*xb,x1,x2,x3;
753   MatScalar      *v;
754   PetscErrorCode ierr;
755   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
756   PetscInt       nonzerorow=0;
757 
758   PetscFunctionBegin;
759   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
760   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
761   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
762 
763   v     = a->a;
764   xb = x;
765 
766   for (i=0; i<mbs; i++) {
767     n  = ai[1] - ai[0]; /* length of i_th block row of A */
768     x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
769     ib = aj + *ai;
770     jmin = 0;
771     nonzerorow += (n>0);
772     if (*ib == i) {     /* (diag of A)*x */
773      z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
774      z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
775      z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
776      v += 9; jmin++;
777     }
778     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
779     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
780     for (j=jmin; j<n; j++) {
781       /* (strict lower triangular part of A)*x  */
782       cval       = ib[j]*3;
783       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3;
784       z[cval+1]   += v[3]*x1 + v[4]*x2 + v[5]*x3;
785       z[cval+2]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
786       /* (strict upper triangular part of A)*x  */
787       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
788       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
789       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
790       v  += 9;
791     }
792     xb +=3; ai++;
793   }
794 
795   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
796   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
797 
798   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
799   PetscFunctionReturn(0);
800 }
801 
802 #undef __FUNCT__
803 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
804 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
805 {
806   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
807   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
808   MatScalar      *v;
809   PetscErrorCode ierr;
810   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
811   PetscInt       nonzerorow=0;
812 
813   PetscFunctionBegin;
814   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
815   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
816   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
817 
818   v     = a->a;
819   xb = x;
820 
821   for (i=0; i<mbs; i++) {
822     n  = ai[1] - ai[0]; /* length of i_th block row of A */
823     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
824     ib = aj + *ai;
825     jmin = 0;
826     nonzerorow += (n>0);
827     if (*ib == i) {      /* (diag of A)*x */
828       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
829       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
830       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
831       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
832       v += 16; jmin++;
833     }
834     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
835     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
836     for (j=jmin; j<n; j++) {
837       /* (strict lower triangular part of A)*x  */
838       cval       = ib[j]*4;
839       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
840       z[cval+1]   += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
841       z[cval+2]   += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
842       z[cval+3]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
843       /* (strict upper triangular part of A)*x  */
844       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
845       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
846       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
847       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
848       v  += 16;
849     }
850     xb +=4; ai++;
851   }
852 
853   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
854   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
855 
856   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
857   PetscFunctionReturn(0);
858 }
859 
860 #undef __FUNCT__
861 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
862 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
863 {
864   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
865   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
866   MatScalar      *v;
867   PetscErrorCode ierr;
868   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
869   PetscInt       nonzerorow=0;
870 
871   PetscFunctionBegin;
872   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
873   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
874   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
875 
876   v     = a->a;
877   xb = x;
878 
879   for (i=0; i<mbs; i++) {
880     n  = ai[1] - ai[0]; /* length of i_th block row of A */
881     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
882     ib = aj + *ai;
883     jmin = 0;
884     nonzerorow += (n>0);
885     if (*ib == i) {      /* (diag of A)*x */
886       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
887       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
888       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
889       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
890       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
891       v += 25; jmin++;
892     }
893     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
894     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
895     for (j=jmin; j<n; j++) {
896       /* (strict lower triangular part of A)*x  */
897       cval       = ib[j]*5;
898       z[cval]     += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
899       z[cval+1]   += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
900       z[cval+2]   += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
901       z[cval+3]   += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
902       z[cval+4]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
903       /* (strict upper triangular part of A)*x  */
904       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
905       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
906       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
907       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
908       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
909       v  += 25;
910     }
911     xb +=5; ai++;
912   }
913 
914   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
915   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
916 
917   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
918   PetscFunctionReturn(0);
919 }
920 #undef __FUNCT__
921 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
922 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
923 {
924   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
925   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
926   MatScalar      *v;
927   PetscErrorCode ierr;
928   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
929   PetscInt       nonzerorow=0;
930 
931   PetscFunctionBegin;
932   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
933   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
934   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
935 
936   v     = a->a;
937   xb = x;
938 
939   for (i=0; i<mbs; i++) {
940     n  = ai[1] - ai[0]; /* length of i_th block row of A */
941     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
942     ib = aj + *ai;
943     jmin = 0;
944     nonzerorow += (n>0);
945     if (*ib == i) {     /* (diag of A)*x */
946       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
947       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
948       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
949       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
950       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
951       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
952       v += 36; jmin++;
953     }
954     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
955     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
956     for (j=jmin; j<n; j++) {
957       /* (strict lower triangular part of A)*x  */
958       cval       = ib[j]*6;
959       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
960       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
961       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
962       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
963       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
964       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
965       /* (strict upper triangular part of A)*x  */
966       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
967       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
968       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
969       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
970       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
971       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
972       v  += 36;
973     }
974     xb +=6; ai++;
975   }
976 
977   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
978   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
979 
980   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
981   PetscFunctionReturn(0);
982 }
983 
984 #undef __FUNCT__
985 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
986 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
987 {
988   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
989   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
990   MatScalar      *v;
991   PetscErrorCode ierr;
992   PetscInt       mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
993   PetscInt       nonzerorow=0;
994 
995   PetscFunctionBegin;
996   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
997   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
998   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
999 
1000   v     = a->a;
1001   xb = x;
1002 
1003   for (i=0; i<mbs; i++) {
1004     n  = ai[1] - ai[0]; /* length of i_th block row of A */
1005     x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1006     ib = aj + *ai;
1007     jmin = 0;
1008     nonzerorow += (n>0);
1009     if (*ib == i) {     /* (diag of A)*x */
1010       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1011       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1012       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1013       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1014       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1015       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1016       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1017       v += 49; jmin++;
1018     }
1019     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1020     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1021     for (j=jmin; j<n; j++) {
1022       /* (strict lower triangular part of A)*x  */
1023       cval       = ib[j]*7;
1024       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1025       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1026       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1027       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1028       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1029       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1030       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1031       /* (strict upper triangular part of A)*x  */
1032       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1033       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1034       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1035       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1036       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1037       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1038       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1039       v  += 49;
1040     }
1041     xb +=7; ai++;
1042   }
1043 
1044   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1045   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1046 
1047   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1048   PetscFunctionReturn(0);
1049 }
1050 
1051 #undef __FUNCT__
1052 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1053 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1054 {
1055   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1056   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1057   MatScalar      *v;
1058   PetscErrorCode ierr;
1059   PetscInt       mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1060   PetscInt       nonzerorow=0;
1061 
1062   PetscFunctionBegin;
1063   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1064   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1065   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1066 
1067   aj   = a->j;
1068   v    = a->a;
1069   ii   = a->i;
1070 
1071   if (!a->mult_work) {
1072     ierr = PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);CHKERRQ(ierr);
1073   }
1074   work = a->mult_work;
1075 
1076 
1077   for (i=0; i<mbs; i++) {
1078     n     = ii[1] - ii[0]; ncols = n*bs;
1079     workt = work; idx=aj+ii[0];
1080     nonzerorow += (n>0);
1081 
1082     /* upper triangular part */
1083     for (j=0; j<n; j++) {
1084       xb = x_ptr + bs*(*idx++);
1085       for (k=0; k<bs; k++) workt[k] = xb[k];
1086       workt += bs;
1087     }
1088     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1089     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1090 
1091     /* strict lower triangular part */
1092     idx = aj+ii[0];
1093     if (*idx == i) {
1094       ncols -= bs; v += bs2; idx++; n--;
1095     }
1096     if (ncols > 0) {
1097       workt = work;
1098       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1099       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1100       for (j=0; j<n; j++) {
1101         zb = z_ptr + bs*(*idx++);
1102         for (k=0; k<bs; k++) zb[k] += workt[k] ;
1103         workt += bs;
1104       }
1105     }
1106 
1107     x += bs; v += n*bs2; z += bs; ii++;
1108   }
1109 
1110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1111   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1112 
1113   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1114   PetscFunctionReturn(0);
1115 }
1116 
1117 #undef __FUNCT__
1118 #define __FUNCT__ "MatScale_SeqSBAIJ"
1119 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1120 {
1121   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)inA->data;
1122   PetscScalar    oalpha = alpha;
1123   PetscErrorCode ierr;
1124   PetscBLASInt   one = 1,totalnz;
1125 
1126   PetscFunctionBegin;
1127   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1128   BLASscal_(&totalnz,&oalpha,a->a,&one);
1129   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1130   PetscFunctionReturn(0);
1131 }
1132 
1133 #undef __FUNCT__
1134 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1135 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1136 {
1137   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1138   MatScalar      *v = a->a;
1139   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1140   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1141   PetscErrorCode ierr;
1142   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1143 
1144   PetscFunctionBegin;
1145   if (type == NORM_FROBENIUS) {
1146     for (k=0; k<mbs; k++) {
1147       jmin = a->i[k]; jmax = a->i[k+1];
1148       col  = aj + jmin;
1149       if (*col == k) {         /* diagonal block */
1150         for (i=0; i<bs2; i++) {
1151           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1152         }
1153         jmin++;
1154       }
1155       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1156         for (i=0; i<bs2; i++) {
1157           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1158         }
1159       }
1160     }
1161     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1162   }  else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1163     ierr = PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);CHKERRQ(ierr);
1164     for (i=0; i<mbs; i++) jl[i] = mbs;
1165     il[0] = 0;
1166 
1167     *norm = 0.0;
1168     for (k=0; k<mbs; k++) { /* k_th block row */
1169       for (j=0; j<bs; j++) sum[j]=0.0;
1170       /*-- col sum --*/
1171       i = jl[k]; /* first |A(i,k)| to be added */
1172       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1173                   at step k */
1174       while (i<mbs) {
1175         nexti = jl[i];  /* next block row to be added */
1176         ik    = il[i];  /* block index of A(i,k) in the array a */
1177         for (j=0; j<bs; j++) {
1178           v = a->a + ik*bs2 + j*bs;
1179           for (k1=0; k1<bs; k1++) {
1180             sum[j] += PetscAbsScalar(*v); v++;
1181           }
1182         }
1183         /* update il, jl */
1184         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1185         jmax = a->i[i+1];
1186         if (jmin < jmax) {
1187           il[i] = jmin;
1188           j   = a->j[jmin];
1189           jl[i] = jl[j]; jl[j]=i;
1190         }
1191         i = nexti;
1192       }
1193       /*-- row sum --*/
1194       jmin = a->i[k]; jmax = a->i[k+1];
1195       for (i=jmin; i<jmax; i++) {
1196         for (j=0; j<bs; j++) {
1197           v = a->a + i*bs2 + j;
1198           for (k1=0; k1<bs; k1++) {
1199             sum[j] += PetscAbsScalar(*v); v += bs;
1200           }
1201         }
1202       }
1203       /* add k_th block row to il, jl */
1204       col = aj+jmin;
1205       if (*col == k) jmin++;
1206       if (jmin < jmax) {
1207         il[k] = jmin;
1208         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1209       }
1210       for (j=0; j<bs; j++) {
1211         if (sum[j] > *norm) *norm = sum[j];
1212       }
1213     }
1214     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1215   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1216   PetscFunctionReturn(0);
1217 }
1218 
1219 #undef __FUNCT__
1220 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1221 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1222 {
1223   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1224   PetscErrorCode ierr;
1225 
1226   PetscFunctionBegin;
1227   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1228   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1229     *flg = PETSC_FALSE;
1230     PetscFunctionReturn(0);
1231   }
1232 
1233   /* if the a->i are the same */
1234   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1235   if (!*flg) {
1236     PetscFunctionReturn(0);
1237   }
1238 
1239   /* if a->j are the same */
1240   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1241   if (!*flg) {
1242     PetscFunctionReturn(0);
1243   }
1244   /* if a->a are the same */
1245   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1246   PetscFunctionReturn(0);
1247 }
1248 
1249 #undef __FUNCT__
1250 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1251 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1252 {
1253   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1254   PetscErrorCode ierr;
1255   PetscInt       i,j,k,row,bs,*ai,*aj,ambs,bs2;
1256   PetscScalar    *x,zero = 0.0;
1257   MatScalar      *aa,*aa_j;
1258 
1259   PetscFunctionBegin;
1260   bs   = A->rmap->bs;
1261   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1262 
1263   aa   = a->a;
1264   ambs = a->mbs;
1265 
1266   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1267     PetscInt *diag=a->diag;
1268     aa   = a->a;
1269     ambs = a->mbs;
1270     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1271     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1272     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1273     PetscFunctionReturn(0);
1274   }
1275 
1276   ai   = a->i;
1277   aj   = a->j;
1278   bs2  = a->bs2;
1279   ierr = VecSet(v,zero);CHKERRQ(ierr);
1280   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1281   for (i=0; i<ambs; i++) {
1282     j=ai[i];
1283     if (aj[j] == i) {    /* if this is a diagonal element */
1284       row  = i*bs;
1285       aa_j = aa + j*bs2;
1286       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1287     }
1288   }
1289   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1290   PetscFunctionReturn(0);
1291 }
1292 
1293 #undef __FUNCT__
1294 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1295 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1296 {
1297   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1298   PetscScalar    *l,x,*li,*ri;
1299   MatScalar      *aa,*v;
1300   PetscErrorCode ierr;
1301   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1302   PetscBool      flg;
1303 
1304   PetscFunctionBegin;
1305   if (ll != rr) {
1306     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1307     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1308   }
1309   if (!ll) PetscFunctionReturn(0);
1310   ai  = a->i;
1311   aj  = a->j;
1312   aa  = a->a;
1313   m   = A->rmap->N;
1314   bs  = A->rmap->bs;
1315   mbs = a->mbs;
1316   bs2 = a->bs2;
1317 
1318   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1319   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1320   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1321   for (i=0; i<mbs; i++) { /* for each block row */
1322     M  = ai[i+1] - ai[i];
1323     li = l + i*bs;
1324     v  = aa + bs2*ai[i];
1325     for (j=0; j<M; j++) { /* for each block */
1326       ri = l + bs*aj[ai[i]+j];
1327       for (k=0; k<bs; k++) {
1328         x = ri[k];
1329         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1330       }
1331     }
1332   }
1333   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1334   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 #undef __FUNCT__
1339 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1340 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1341 {
1342   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1343 
1344   PetscFunctionBegin;
1345   info->block_size     = a->bs2;
1346   info->nz_allocated   = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1347   info->nz_used        = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1348   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1349   info->assemblies   = A->num_ass;
1350   info->mallocs      = A->info.mallocs;
1351   info->memory       = ((PetscObject)A)->mem;
1352   if (A->factortype) {
1353     info->fill_ratio_given  = A->info.fill_ratio_given;
1354     info->fill_ratio_needed = A->info.fill_ratio_needed;
1355     info->factor_mallocs    = A->info.factor_mallocs;
1356   } else {
1357     info->fill_ratio_given  = 0;
1358     info->fill_ratio_needed = 0;
1359     info->factor_mallocs    = 0;
1360   }
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 
1365 #undef __FUNCT__
1366 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1367 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1368 {
1369   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1370   PetscErrorCode ierr;
1371 
1372   PetscFunctionBegin;
1373   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 #undef __FUNCT__
1378 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1379 /*
1380    This code does not work since it only checks the upper triangular part of
1381   the matrix. Hence it is not listed in the function table.
1382 */
1383 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1384 {
1385   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1386   PetscErrorCode ierr;
1387   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1388   PetscReal      atmp;
1389   MatScalar      *aa;
1390   PetscScalar    *x;
1391   PetscInt       ncols,brow,bcol,krow,kcol;
1392 
1393   PetscFunctionBegin;
1394   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1395   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1396   bs   = A->rmap->bs;
1397   aa   = a->a;
1398   ai   = a->i;
1399   aj   = a->j;
1400   mbs = a->mbs;
1401 
1402   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1403   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1404   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1405   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1406   for (i=0; i<mbs; i++) {
1407     ncols = ai[1] - ai[0]; ai++;
1408     brow  = bs*i;
1409     for (j=0; j<ncols; j++) {
1410       bcol = bs*(*aj);
1411       for (kcol=0; kcol<bs; kcol++) {
1412         col = bcol + kcol;      /* col index */
1413         for (krow=0; krow<bs; krow++) {
1414           atmp = PetscAbsScalar(*aa); aa++;
1415           row = brow + krow;    /* row index */
1416           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1417           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1418         }
1419       }
1420       aj++;
1421     }
1422   }
1423   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1424   PetscFunctionReturn(0);
1425 }
1426