xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision bebe2cf65d55febe21a5af8db2bd2e168caaa2e7)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc/private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs  = a->mbs;
21   ai   = a->i;
22   aj   = a->j;
23   bs   = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,MatReuse scall,Mat *B)
98 {
99   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data,*c;
100   PetscErrorCode  ierr;
101   PetscInt        *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
102   PetscInt        row,mat_i,*mat_j,tcol,*mat_ilen;
103   PetscInt        nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
104   const PetscInt  *irow,*aj = a->j,*ai = a->i;
105   MatScalar       *mat_a;
106   Mat             C;
107   PetscBool       flag,sorted;
108 
109   PetscFunctionBegin;
110   ierr = ISSorted(isrow,&sorted);CHKERRQ(ierr);
111   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112 
113   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
114   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
115 
116   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
117   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
118   ssmap = smap;
119   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
120   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121   /* determine lens of each row */
122   for (i=0; i<nrows; i++) {
123     kstart  = ai[irow[i]];
124     kend    = kstart + a->ilen[irow[i]];
125     lens[i] = 0;
126     for (k=kstart; k<kend; k++) {
127       if (ssmap[aj[k]]) lens[i]++;
128     }
129   }
130   /* Create and fill new matrix */
131   if (scall == MAT_REUSE_MATRIX) {
132     c = (Mat_SeqSBAIJ*)((*B)->data);
133 
134     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
135     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
136     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
137     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
138     C    = *B;
139   } else {
140     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
141     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
142     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
143     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
144   }
145   c = (Mat_SeqSBAIJ*)(C->data);
146   for (i=0; i<nrows; i++) {
147     row      = irow[i];
148     kstart   = ai[row];
149     kend     = kstart + a->ilen[row];
150     mat_i    = c->i[i];
151     mat_j    = c->j + mat_i;
152     mat_a    = c->a + mat_i*bs2;
153     mat_ilen = c->ilen + i;
154     for (k=kstart; k<kend; k++) {
155       if ((tcol=ssmap[a->j[k]])) {
156         *mat_j++ = tcol - 1;
157         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
158         mat_a   += bs2;
159         (*mat_ilen)++;
160       }
161     }
162   }
163 
164   /* Free work space */
165   ierr = PetscFree(smap);CHKERRQ(ierr);
166   ierr = PetscFree(lens);CHKERRQ(ierr);
167   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
168   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169 
170   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
171   *B   = C;
172   PetscFunctionReturn(0);
173 }
174 
175 #undef __FUNCT__
176 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
177 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
178 {
179   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
180   IS             is1;
181   PetscErrorCode ierr;
182   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
183   const PetscInt *irow;
184 
185   PetscFunctionBegin;
186   if (isrow != iscol) {
187     PetscBool isequal;
188     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
189     if (!isequal)
190       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isrow");
191   }
192 
193   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
194   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
195 
196   /* Verify if the indices corespond to each element in a block
197    and form the IS with compressed IS */
198   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
199   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
200   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
201 
202   count = 0;
203   for (i=0; i<a->mbs; i++) {
204     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
205     if (vary[i]==bs) iary[count++] = i;
206   }
207   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
208   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
209   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
210 
211   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,scall,B);CHKERRQ(ierr);
212   ierr = ISDestroy(&is1);CHKERRQ(ierr);
213   PetscFunctionReturn(0);
214 }
215 
216 #undef __FUNCT__
217 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
218 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
219 {
220   PetscErrorCode ierr;
221   PetscInt       i;
222 
223   PetscFunctionBegin;
224   if (scall == MAT_INITIAL_MATRIX) {
225     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<n; i++) {
229     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
230   }
231   PetscFunctionReturn(0);
232 }
233 
234 /* -------------------------------------------------------*/
235 /* Should check that shapes of vectors and matrices match */
236 /* -------------------------------------------------------*/
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
240 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
241 {
242   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
243   PetscScalar       *z,x1,x2,zero=0.0;
244   const PetscScalar *x,*xb;
245   const MatScalar   *v;
246   PetscErrorCode    ierr;
247   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
248   const PetscInt    *aj=a->j,*ai=a->i,*ib;
249   PetscInt          nonzerorow=0;
250 
251   PetscFunctionBegin;
252   ierr = VecSet(zz,zero);CHKERRQ(ierr);
253   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
254   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
255 
256   v  = a->a;
257   xb = x;
258 
259   for (i=0; i<mbs; i++) {
260     n           = ai[1] - ai[0]; /* length of i_th block row of A */
261     x1          = xb[0]; x2 = xb[1];
262     ib          = aj + *ai;
263     jmin        = 0;
264     nonzerorow += (n>0);
265     if (*ib == i) {     /* (diag of A)*x */
266       z[2*i]   += v[0]*x1 + v[2]*x2;
267       z[2*i+1] += v[2]*x1 + v[3]*x2;
268       v        += 4; jmin++;
269     }
270     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
271     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
272     for (j=jmin; j<n; j++) {
273       /* (strict lower triangular part of A)*x  */
274       cval       = ib[j]*2;
275       z[cval]   += v[0]*x1 + v[1]*x2;
276       z[cval+1] += v[2]*x1 + v[3]*x2;
277       /* (strict upper triangular part of A)*x  */
278       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
279       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
280       v        += 4;
281     }
282     xb +=2; ai++;
283   }
284 
285   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
286   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
287   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
288   PetscFunctionReturn(0);
289 }
290 
291 #undef __FUNCT__
292 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
293 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
294 {
295   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
296   PetscScalar       *z,x1,x2,x3,zero=0.0;
297   const PetscScalar *x,*xb;
298   const MatScalar   *v;
299   PetscErrorCode    ierr;
300   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
301   const PetscInt    *aj = a->j,*ai = a->i,*ib;
302   PetscInt          nonzerorow=0;
303 
304   PetscFunctionBegin;
305   ierr = VecSet(zz,zero);CHKERRQ(ierr);
306   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
307   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
308 
309   v  = a->a;
310   xb = x;
311 
312   for (i=0; i<mbs; i++) {
313     n           = ai[1] - ai[0]; /* length of i_th block row of A */
314     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
315     ib          = aj + *ai;
316     jmin        = 0;
317     nonzerorow += (n>0);
318     if (*ib == i) {     /* (diag of A)*x */
319       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
320       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
321       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
322       v        += 9; jmin++;
323     }
324     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
325     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
326     for (j=jmin; j<n; j++) {
327       /* (strict lower triangular part of A)*x  */
328       cval       = ib[j]*3;
329       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
330       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
331       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
332       /* (strict upper triangular part of A)*x  */
333       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
334       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
335       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
336       v        += 9;
337     }
338     xb +=3; ai++;
339   }
340 
341   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
342   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
343   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
344   PetscFunctionReturn(0);
345 }
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
349 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
350 {
351   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
352   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
353   const PetscScalar *x,*xb;
354   const MatScalar   *v;
355   PetscErrorCode    ierr;
356   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
357   const PetscInt    *aj = a->j,*ai = a->i,*ib;
358   PetscInt          nonzerorow = 0;
359 
360   PetscFunctionBegin;
361   ierr = VecSet(zz,zero);CHKERRQ(ierr);
362   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
363   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
364 
365   v  = a->a;
366   xb = x;
367 
368   for (i=0; i<mbs; i++) {
369     n           = ai[1] - ai[0]; /* length of i_th block row of A */
370     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
371     ib          = aj + *ai;
372     jmin        = 0;
373     nonzerorow += (n>0);
374     if (*ib == i) {     /* (diag of A)*x */
375       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
376       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
377       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
378       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
379       v        += 16; jmin++;
380     }
381     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
382     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
383     for (j=jmin; j<n; j++) {
384       /* (strict lower triangular part of A)*x  */
385       cval       = ib[j]*4;
386       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
387       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
388       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
389       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
390       /* (strict upper triangular part of A)*x  */
391       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
392       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
393       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
394       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
395       v        += 16;
396     }
397     xb +=4; ai++;
398   }
399 
400   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
401   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
402   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
408 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
409 {
410   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
411   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
412   const PetscScalar *x,*xb;
413   const MatScalar   *v;
414   PetscErrorCode    ierr;
415   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
416   const PetscInt    *aj = a->j,*ai = a->i,*ib;
417   PetscInt          nonzerorow=0;
418 
419   PetscFunctionBegin;
420   ierr = VecSet(zz,zero);CHKERRQ(ierr);
421   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
422   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
423 
424   v  = a->a;
425   xb = x;
426 
427   for (i=0; i<mbs; i++) {
428     n           = ai[1] - ai[0]; /* length of i_th block row of A */
429     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
430     ib          = aj + *ai;
431     jmin        = 0;
432     nonzerorow += (n>0);
433     if (*ib == i) {      /* (diag of A)*x */
434       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
435       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
436       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
437       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
438       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
439       v        += 25; jmin++;
440     }
441     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
442     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
443     for (j=jmin; j<n; j++) {
444       /* (strict lower triangular part of A)*x  */
445       cval       = ib[j]*5;
446       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
447       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
448       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
449       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
450       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
451       /* (strict upper triangular part of A)*x  */
452       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
453       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
454       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
455       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
456       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
457       v        += 25;
458     }
459     xb +=5; ai++;
460   }
461 
462   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
463   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
464   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
465   PetscFunctionReturn(0);
466 }
467 
468 
469 #undef __FUNCT__
470 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
471 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
472 {
473   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
474   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
475   const PetscScalar *x,*xb;
476   const MatScalar   *v;
477   PetscErrorCode    ierr;
478   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
479   const PetscInt    *aj=a->j,*ai=a->i,*ib;
480   PetscInt          nonzerorow=0;
481 
482   PetscFunctionBegin;
483   ierr = VecSet(zz,zero);CHKERRQ(ierr);
484   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
485   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
486 
487   v  = a->a;
488   xb = x;
489 
490   for (i=0; i<mbs; i++) {
491     n           = ai[1] - ai[0]; /* length of i_th block row of A */
492     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
493     ib          = aj + *ai;
494     jmin        = 0;
495     nonzerorow += (n>0);
496     if (*ib == i) {      /* (diag of A)*x */
497       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
498       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
499       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
500       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
501       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
502       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
503       v        += 36; jmin++;
504     }
505     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
506     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
507     for (j=jmin; j<n; j++) {
508       /* (strict lower triangular part of A)*x  */
509       cval       = ib[j]*6;
510       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
511       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
512       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
513       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
514       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
515       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
516       /* (strict upper triangular part of A)*x  */
517       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
518       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
519       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
520       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
521       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
522       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
523       v        += 36;
524     }
525     xb +=6; ai++;
526   }
527 
528   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
529   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
530   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
531   PetscFunctionReturn(0);
532 }
533 #undef __FUNCT__
534 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
535 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
536 {
537   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
538   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
539   const PetscScalar *x,*xb;
540   const MatScalar   *v;
541   PetscErrorCode    ierr;
542   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
543   const PetscInt    *aj=a->j,*ai=a->i,*ib;
544   PetscInt          nonzerorow=0;
545 
546   PetscFunctionBegin;
547   ierr = VecSet(zz,zero);CHKERRQ(ierr);
548   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
549   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
550 
551   v  = a->a;
552   xb = x;
553 
554   for (i=0; i<mbs; i++) {
555     n           = ai[1] - ai[0]; /* length of i_th block row of A */
556     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
557     ib          = aj + *ai;
558     jmin        = 0;
559     nonzerorow += (n>0);
560     if (*ib == i) {      /* (diag of A)*x */
561       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
562       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
563       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
564       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
565       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
566       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
567       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
568       v        += 49; jmin++;
569     }
570     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
571     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
572     for (j=jmin; j<n; j++) {
573       /* (strict lower triangular part of A)*x  */
574       cval       = ib[j]*7;
575       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
576       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
577       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
578       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
579       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
580       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
581       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
582       /* (strict upper triangular part of A)*x  */
583       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
584       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
585       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
586       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
587       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
588       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
589       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
590       v       += 49;
591     }
592     xb +=7; ai++;
593   }
594   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
595   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
596   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
597   PetscFunctionReturn(0);
598 }
599 
600 /*
601     This will not work with MatScalar == float because it calls the BLAS
602 */
603 #undef __FUNCT__
604 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
605 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
606 {
607   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
608   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
609   const PetscScalar *x,*x_ptr,*xb;
610   const MatScalar   *v;
611   PetscErrorCode    ierr;
612   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
613   const PetscInt    *idx,*aj,*ii;
614   PetscInt          nonzerorow=0;
615 
616   PetscFunctionBegin;
617   ierr = VecSet(zz,zero);CHKERRQ(ierr);
618   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);x_ptr = x;
619   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
620 
621   aj = a->j;
622   v  = a->a;
623   ii = a->i;
624 
625   if (!a->mult_work) {
626     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
627   }
628   work = a->mult_work;
629 
630   for (i=0; i<mbs; i++) {
631     n           = ii[1] - ii[0]; ncols = n*bs;
632     workt       = work; idx=aj+ii[0];
633     nonzerorow += (n>0);
634 
635     /* upper triangular part */
636     for (j=0; j<n; j++) {
637       xb = x_ptr + bs*(*idx++);
638       for (k=0; k<bs; k++) workt[k] = xb[k];
639       workt += bs;
640     }
641     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
642     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
643 
644     /* strict lower triangular part */
645     idx = aj+ii[0];
646     if (*idx == i) {
647       ncols -= bs; v += bs2; idx++; n--;
648     }
649 
650     if (ncols > 0) {
651       workt = work;
652       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
653       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
654       for (j=0; j<n; j++) {
655         zb = z_ptr + bs*(*idx++);
656         for (k=0; k<bs; k++) zb[k] += workt[k];
657         workt += bs;
658       }
659     }
660     x += bs; v += n*bs2; z += bs; ii++;
661   }
662 
663   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
664   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
665   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
666   PetscFunctionReturn(0);
667 }
668 
669 #undef __FUNCT__
670 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
671 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
672 {
673   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
674   PetscScalar       *z,x1;
675   const PetscScalar *x,*xb;
676   const MatScalar   *v;
677   PetscErrorCode    ierr;
678   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
679   const PetscInt    *aj=a->j,*ai=a->i,*ib;
680   PetscInt          nonzerorow=0;
681 
682   PetscFunctionBegin;
683   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
684   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
685   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
686   v    = a->a;
687   xb   = x;
688 
689   for (i=0; i<mbs; i++) {
690     n           = ai[1] - ai[0]; /* length of i_th row of A */
691     x1          = xb[0];
692     ib          = aj + *ai;
693     jmin        = 0;
694     nonzerorow += (n>0);
695     if (*ib == i) {            /* (diag of A)*x */
696       z[i] += *v++ * x[*ib++]; jmin++;
697     }
698     for (j=jmin; j<n; j++) {
699       cval    = *ib;
700       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
701       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
702     }
703     xb++; ai++;
704   }
705 
706   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
707   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
708 
709   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
710   PetscFunctionReturn(0);
711 }
712 
713 #undef __FUNCT__
714 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
715 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
716 {
717   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
718   PetscScalar       *z,x1,x2;
719   const PetscScalar *x,*xb;
720   const MatScalar   *v;
721   PetscErrorCode    ierr;
722   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
723   const PetscInt    *aj=a->j,*ai=a->i,*ib;
724   PetscInt          nonzerorow=0;
725 
726   PetscFunctionBegin;
727   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
728   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
729   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
730 
731   v  = a->a;
732   xb = x;
733 
734   for (i=0; i<mbs; i++) {
735     n           = ai[1] - ai[0]; /* length of i_th block row of A */
736     x1          = xb[0]; x2 = xb[1];
737     ib          = aj + *ai;
738     jmin        = 0;
739     nonzerorow += (n>0);
740     if (*ib == i) {      /* (diag of A)*x */
741       z[2*i]   += v[0]*x1 + v[2]*x2;
742       z[2*i+1] += v[2]*x1 + v[3]*x2;
743       v        += 4; jmin++;
744     }
745     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
746     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
747     for (j=jmin; j<n; j++) {
748       /* (strict lower triangular part of A)*x  */
749       cval       = ib[j]*2;
750       z[cval]   += v[0]*x1 + v[1]*x2;
751       z[cval+1] += v[2]*x1 + v[3]*x2;
752       /* (strict upper triangular part of A)*x  */
753       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
754       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
755       v        += 4;
756     }
757     xb +=2; ai++;
758   }
759   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
760   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
761 
762   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
763   PetscFunctionReturn(0);
764 }
765 
766 #undef __FUNCT__
767 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
768 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
769 {
770   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
771   PetscScalar       *z,x1,x2,x3;
772   const PetscScalar *x,*xb;
773   const MatScalar   *v;
774   PetscErrorCode    ierr;
775   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
776   const PetscInt    *aj=a->j,*ai=a->i,*ib;
777   PetscInt          nonzerorow=0;
778 
779   PetscFunctionBegin;
780   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
781   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
782   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
783 
784   v  = a->a;
785   xb = x;
786 
787   for (i=0; i<mbs; i++) {
788     n           = ai[1] - ai[0]; /* length of i_th block row of A */
789     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
790     ib          = aj + *ai;
791     jmin        = 0;
792     nonzerorow += (n>0);
793     if (*ib == i) {     /* (diag of A)*x */
794       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
795       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
796       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
797       v        += 9; jmin++;
798     }
799     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
800     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
801     for (j=jmin; j<n; j++) {
802       /* (strict lower triangular part of A)*x  */
803       cval       = ib[j]*3;
804       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
805       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
806       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
807       /* (strict upper triangular part of A)*x  */
808       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
809       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
810       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
811       v        += 9;
812     }
813     xb +=3; ai++;
814   }
815 
816   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
817   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
818 
819   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
820   PetscFunctionReturn(0);
821 }
822 
823 #undef __FUNCT__
824 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
825 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
826 {
827   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
828   PetscScalar       *z,x1,x2,x3,x4;
829   const PetscScalar *x,*xb;
830   const MatScalar   *v;
831   PetscErrorCode    ierr;
832   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
833   const PetscInt    *aj=a->j,*ai=a->i,*ib;
834   PetscInt          nonzerorow=0;
835 
836   PetscFunctionBegin;
837   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
838   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
839   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
840 
841   v  = a->a;
842   xb = x;
843 
844   for (i=0; i<mbs; i++) {
845     n           = ai[1] - ai[0]; /* length of i_th block row of A */
846     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
847     ib          = aj + *ai;
848     jmin        = 0;
849     nonzerorow += (n>0);
850     if (*ib == i) {      /* (diag of A)*x */
851       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
852       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
853       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
854       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
855       v        += 16; jmin++;
856     }
857     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
858     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
859     for (j=jmin; j<n; j++) {
860       /* (strict lower triangular part of A)*x  */
861       cval       = ib[j]*4;
862       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
863       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
864       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
865       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
866       /* (strict upper triangular part of A)*x  */
867       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
868       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
869       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
870       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
871       v        += 16;
872     }
873     xb +=4; ai++;
874   }
875 
876   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
877   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
878 
879   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
880   PetscFunctionReturn(0);
881 }
882 
883 #undef __FUNCT__
884 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
885 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
886 {
887   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
888   PetscScalar       *z,x1,x2,x3,x4,x5;
889   const PetscScalar *x,*xb;
890   const MatScalar   *v;
891   PetscErrorCode    ierr;
892   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
893   const PetscInt    *aj=a->j,*ai=a->i,*ib;
894   PetscInt          nonzerorow=0;
895 
896   PetscFunctionBegin;
897   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
898   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
899   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
900 
901   v  = a->a;
902   xb = x;
903 
904   for (i=0; i<mbs; i++) {
905     n           = ai[1] - ai[0]; /* length of i_th block row of A */
906     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
907     ib          = aj + *ai;
908     jmin        = 0;
909     nonzerorow += (n>0);
910     if (*ib == i) {      /* (diag of A)*x */
911       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
912       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
913       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
914       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
915       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
916       v        += 25; jmin++;
917     }
918     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
919     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
920     for (j=jmin; j<n; j++) {
921       /* (strict lower triangular part of A)*x  */
922       cval       = ib[j]*5;
923       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
924       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
925       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
926       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
927       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
928       /* (strict upper triangular part of A)*x  */
929       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
930       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
931       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
932       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
933       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
934       v        += 25;
935     }
936     xb +=5; ai++;
937   }
938 
939   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
940   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
941 
942   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
943   PetscFunctionReturn(0);
944 }
945 #undef __FUNCT__
946 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
947 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
948 {
949   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
950   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
951   const PetscScalar *x,*xb;
952   const MatScalar   *v;
953   PetscErrorCode    ierr;
954   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
955   const PetscInt    *aj=a->j,*ai=a->i,*ib;
956   PetscInt          nonzerorow=0;
957 
958   PetscFunctionBegin;
959   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
960   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
961   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
962 
963   v  = a->a;
964   xb = x;
965 
966   for (i=0; i<mbs; i++) {
967     n           = ai[1] - ai[0]; /* length of i_th block row of A */
968     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
969     ib          = aj + *ai;
970     jmin        = 0;
971     nonzerorow += (n>0);
972     if (*ib == i) {     /* (diag of A)*x */
973       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
974       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
975       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
976       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
977       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
978       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
979       v        += 36; jmin++;
980     }
981     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
982     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
983     for (j=jmin; j<n; j++) {
984       /* (strict lower triangular part of A)*x  */
985       cval       = ib[j]*6;
986       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
987       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
988       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
989       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
990       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
991       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
992       /* (strict upper triangular part of A)*x  */
993       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
994       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
995       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
996       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
997       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
998       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
999       v        += 36;
1000     }
1001     xb +=6; ai++;
1002   }
1003 
1004   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1005   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1006 
1007   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1008   PetscFunctionReturn(0);
1009 }
1010 
1011 #undef __FUNCT__
1012 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
1013 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1014 {
1015   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1016   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1017   const PetscScalar *x,*xb;
1018   const MatScalar   *v;
1019   PetscErrorCode    ierr;
1020   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1021   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1022   PetscInt          nonzerorow=0;
1023 
1024   PetscFunctionBegin;
1025   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1026   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1027   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1028 
1029   v  = a->a;
1030   xb = x;
1031 
1032   for (i=0; i<mbs; i++) {
1033     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1034     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1035     ib          = aj + *ai;
1036     jmin        = 0;
1037     nonzerorow += (n>0);
1038     if (*ib == i) {     /* (diag of A)*x */
1039       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1040       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1041       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1042       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1043       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1044       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1045       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1046       v        += 49; jmin++;
1047     }
1048     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1049     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1050     for (j=jmin; j<n; j++) {
1051       /* (strict lower triangular part of A)*x  */
1052       cval       = ib[j]*7;
1053       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1054       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1055       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1056       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1057       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1058       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1059       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1060       /* (strict upper triangular part of A)*x  */
1061       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1062       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1063       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1064       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1065       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1066       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1067       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1068       v       += 49;
1069     }
1070     xb +=7; ai++;
1071   }
1072 
1073   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1074   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1075 
1076   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1077   PetscFunctionReturn(0);
1078 }
1079 
1080 #undef __FUNCT__
1081 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1082 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1083 {
1084   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1085   PetscScalar       *z,*z_ptr=0,*zb,*work,*workt;
1086   const PetscScalar *x,*x_ptr,*xb;
1087   const MatScalar   *v;
1088   PetscErrorCode    ierr;
1089   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1090   const PetscInt    *idx,*aj,*ii;
1091   PetscInt          nonzerorow=0;
1092 
1093   PetscFunctionBegin;
1094   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1095   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1096   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1097 
1098   aj = a->j;
1099   v  = a->a;
1100   ii = a->i;
1101 
1102   if (!a->mult_work) {
1103     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1104   }
1105   work = a->mult_work;
1106 
1107 
1108   for (i=0; i<mbs; i++) {
1109     n           = ii[1] - ii[0]; ncols = n*bs;
1110     workt       = work; idx=aj+ii[0];
1111     nonzerorow += (n>0);
1112 
1113     /* upper triangular part */
1114     for (j=0; j<n; j++) {
1115       xb = x_ptr + bs*(*idx++);
1116       for (k=0; k<bs; k++) workt[k] = xb[k];
1117       workt += bs;
1118     }
1119     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1120     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1121 
1122     /* strict lower triangular part */
1123     idx = aj+ii[0];
1124     if (*idx == i) {
1125       ncols -= bs; v += bs2; idx++; n--;
1126     }
1127     if (ncols > 0) {
1128       workt = work;
1129       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1130       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1131       for (j=0; j<n; j++) {
1132         zb = z_ptr + bs*(*idx++);
1133         for (k=0; k<bs; k++) zb[k] += workt[k];
1134         workt += bs;
1135       }
1136     }
1137 
1138     x += bs; v += n*bs2; z += bs; ii++;
1139   }
1140 
1141   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1142   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1143 
1144   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1145   PetscFunctionReturn(0);
1146 }
1147 
1148 #undef __FUNCT__
1149 #define __FUNCT__ "MatScale_SeqSBAIJ"
1150 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1151 {
1152   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1153   PetscScalar    oalpha = alpha;
1154   PetscErrorCode ierr;
1155   PetscBLASInt   one = 1,totalnz;
1156 
1157   PetscFunctionBegin;
1158   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1159   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1160   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1161   PetscFunctionReturn(0);
1162 }
1163 
1164 #undef __FUNCT__
1165 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1166 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1167 {
1168   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1169   const MatScalar *v       = a->a;
1170   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1171   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1172   PetscErrorCode  ierr;
1173   const PetscInt  *aj=a->j,*col;
1174 
1175   PetscFunctionBegin;
1176   if (type == NORM_FROBENIUS) {
1177     for (k=0; k<mbs; k++) {
1178       jmin = a->i[k]; jmax = a->i[k+1];
1179       col  = aj + jmin;
1180       if (*col == k) {         /* diagonal block */
1181         for (i=0; i<bs2; i++) {
1182           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1183         }
1184         jmin++;
1185       }
1186       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1187         for (i=0; i<bs2; i++) {
1188           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1189         }
1190       }
1191     }
1192     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1193   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1194     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1195     for (i=0; i<mbs; i++) jl[i] = mbs;
1196     il[0] = 0;
1197 
1198     *norm = 0.0;
1199     for (k=0; k<mbs; k++) { /* k_th block row */
1200       for (j=0; j<bs; j++) sum[j]=0.0;
1201       /*-- col sum --*/
1202       i = jl[k]; /* first |A(i,k)| to be added */
1203       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1204                   at step k */
1205       while (i<mbs) {
1206         nexti = jl[i];  /* next block row to be added */
1207         ik    = il[i];  /* block index of A(i,k) in the array a */
1208         for (j=0; j<bs; j++) {
1209           v = a->a + ik*bs2 + j*bs;
1210           for (k1=0; k1<bs; k1++) {
1211             sum[j] += PetscAbsScalar(*v); v++;
1212           }
1213         }
1214         /* update il, jl */
1215         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1216         jmax = a->i[i+1];
1217         if (jmin < jmax) {
1218           il[i] = jmin;
1219           j     = a->j[jmin];
1220           jl[i] = jl[j]; jl[j]=i;
1221         }
1222         i = nexti;
1223       }
1224       /*-- row sum --*/
1225       jmin = a->i[k]; jmax = a->i[k+1];
1226       for (i=jmin; i<jmax; i++) {
1227         for (j=0; j<bs; j++) {
1228           v = a->a + i*bs2 + j;
1229           for (k1=0; k1<bs; k1++) {
1230             sum[j] += PetscAbsScalar(*v); v += bs;
1231           }
1232         }
1233       }
1234       /* add k_th block row to il, jl */
1235       col = aj+jmin;
1236       if (*col == k) jmin++;
1237       if (jmin < jmax) {
1238         il[k] = jmin;
1239         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1240       }
1241       for (j=0; j<bs; j++) {
1242         if (sum[j] > *norm) *norm = sum[j];
1243       }
1244     }
1245     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1246   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1247   PetscFunctionReturn(0);
1248 }
1249 
1250 #undef __FUNCT__
1251 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1252 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1253 {
1254   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1255   PetscErrorCode ierr;
1256 
1257   PetscFunctionBegin;
1258   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1259   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1260     *flg = PETSC_FALSE;
1261     PetscFunctionReturn(0);
1262   }
1263 
1264   /* if the a->i are the same */
1265   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1266   if (!*flg) PetscFunctionReturn(0);
1267 
1268   /* if a->j are the same */
1269   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1270   if (!*flg) PetscFunctionReturn(0);
1271 
1272   /* if a->a are the same */
1273   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 #undef __FUNCT__
1278 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1279 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1280 {
1281   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1282   PetscErrorCode  ierr;
1283   PetscInt        i,j,k,row,bs,ambs,bs2;
1284   const PetscInt  *ai,*aj;
1285   PetscScalar     *x,zero = 0.0;
1286   const MatScalar *aa,*aa_j;
1287 
1288   PetscFunctionBegin;
1289   bs = A->rmap->bs;
1290   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1291 
1292   aa   = a->a;
1293   ambs = a->mbs;
1294 
1295   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1296     PetscInt *diag=a->diag;
1297     aa   = a->a;
1298     ambs = a->mbs;
1299     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1300     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1301     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1302     PetscFunctionReturn(0);
1303   }
1304 
1305   ai   = a->i;
1306   aj   = a->j;
1307   bs2  = a->bs2;
1308   ierr = VecSet(v,zero);CHKERRQ(ierr);
1309   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1310   for (i=0; i<ambs; i++) {
1311     j=ai[i];
1312     if (aj[j] == i) {    /* if this is a diagonal element */
1313       row  = i*bs;
1314       aa_j = aa + j*bs2;
1315       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1316     }
1317   }
1318   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1319   PetscFunctionReturn(0);
1320 }
1321 
1322 #undef __FUNCT__
1323 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1324 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1325 {
1326   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1327   PetscScalar       x;
1328   const PetscScalar *l,*li,*ri;
1329   MatScalar         *aa,*v;
1330   PetscErrorCode    ierr;
1331   PetscInt          i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1332   PetscBool         flg;
1333 
1334   PetscFunctionBegin;
1335   if (ll != rr) {
1336     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1337     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1338   }
1339   if (!ll) PetscFunctionReturn(0);
1340   ai  = a->i;
1341   aj  = a->j;
1342   aa  = a->a;
1343   m   = A->rmap->N;
1344   bs  = A->rmap->bs;
1345   mbs = a->mbs;
1346   bs2 = a->bs2;
1347 
1348   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1349   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1350   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1351   for (i=0; i<mbs; i++) { /* for each block row */
1352     M  = ai[i+1] - ai[i];
1353     li = l + i*bs;
1354     v  = aa + bs2*ai[i];
1355     for (j=0; j<M; j++) { /* for each block */
1356       ri = l + bs*aj[ai[i]+j];
1357       for (k=0; k<bs; k++) {
1358         x = ri[k];
1359         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1360       }
1361     }
1362   }
1363   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1364   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 #undef __FUNCT__
1369 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1370 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1371 {
1372   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1373 
1374   PetscFunctionBegin;
1375   info->block_size   = a->bs2;
1376   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1377   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1378   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1379   info->assemblies   = A->num_ass;
1380   info->mallocs      = A->info.mallocs;
1381   info->memory       = ((PetscObject)A)->mem;
1382   if (A->factortype) {
1383     info->fill_ratio_given  = A->info.fill_ratio_given;
1384     info->fill_ratio_needed = A->info.fill_ratio_needed;
1385     info->factor_mallocs    = A->info.factor_mallocs;
1386   } else {
1387     info->fill_ratio_given  = 0;
1388     info->fill_ratio_needed = 0;
1389     info->factor_mallocs    = 0;
1390   }
1391   PetscFunctionReturn(0);
1392 }
1393 
1394 
1395 #undef __FUNCT__
1396 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1397 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1398 {
1399   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1400   PetscErrorCode ierr;
1401 
1402   PetscFunctionBegin;
1403   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1404   PetscFunctionReturn(0);
1405 }
1406 
1407 #undef __FUNCT__
1408 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1409 /*
1410    This code does not work since it only checks the upper triangular part of
1411   the matrix. Hence it is not listed in the function table.
1412 */
1413 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1414 {
1415   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1416   PetscErrorCode  ierr;
1417   PetscInt        i,j,n,row,col,bs,mbs;
1418   const PetscInt  *ai,*aj;
1419   PetscReal       atmp;
1420   const MatScalar *aa;
1421   PetscScalar     *x;
1422   PetscInt        ncols,brow,bcol,krow,kcol;
1423 
1424   PetscFunctionBegin;
1425   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1426   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1427   bs  = A->rmap->bs;
1428   aa  = a->a;
1429   ai  = a->i;
1430   aj  = a->j;
1431   mbs = a->mbs;
1432 
1433   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1434   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1435   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1436   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1437   for (i=0; i<mbs; i++) {
1438     ncols = ai[1] - ai[0]; ai++;
1439     brow  = bs*i;
1440     for (j=0; j<ncols; j++) {
1441       bcol = bs*(*aj);
1442       for (kcol=0; kcol<bs; kcol++) {
1443         col = bcol + kcol;      /* col index */
1444         for (krow=0; krow<bs; krow++) {
1445           atmp = PetscAbsScalar(*aa); aa++;
1446           row  = brow + krow;   /* row index */
1447           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1448           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1449         }
1450       }
1451       aj++;
1452     }
1453   }
1454   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1455   PetscFunctionReturn(0);
1456 }
1457