xref: /petsc/src/mat/impls/sbaij/seq/sbaij2.c (revision 3e7ff0edd3573be01c8c0fa32db97c3db8fa5c8d)
1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <petsc-private/kernels/blockinvert.h>
4 #include <petscbt.h>
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <petscblaslapack.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "MatIncreaseOverlap_SeqSBAIJ"
10 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11 {
12   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15   const PetscInt *idx;
16   PetscBT        table_out,table_in;
17 
18   PetscFunctionBegin;
19   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20   mbs  = a->mbs;
21   ai   = a->i;
22   aj   = a->j;
23   bs   = A->rmap->bs;
24   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
25   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
26   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
27   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
28 
29   for (i=0; i<is_max; i++) { /* for each is */
30     isz  = 0;
31     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
32 
33     /* Extract the indices, assume there can be duplicate entries */
34     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
35     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
36 
37     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38     bcol_max = 0;
39     for (j=0; j<n; ++j) {
40       brow = idx[j]/bs; /* convert the indices into block indices */
41       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42       if (!PetscBTLookupSet(table_out,brow)) {
43         nidx[isz++] = brow;
44         if (bcol_max < brow) bcol_max = brow;
45       }
46     }
47     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
48     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
49 
50     k = 0;
51     for (j=0; j<ov; j++) { /* for each overlap */
52       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
54       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
55 
56       n = isz;  /* length of the updated is[i] */
57       for (brow=0; brow<mbs; brow++) {
58         start = ai[brow]; end   = ai[brow+1];
59         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60           for (l = start; l<end; l++) {
61             bcol = aj[l];
62             if (!PetscBTLookupSet(table_out,bcol)) {
63               nidx[isz++] = bcol;
64               if (bcol_max < bcol) bcol_max = bcol;
65             }
66           }
67           k++;
68           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70           for (l = start; l<end; l++) {
71             bcol = aj[l];
72             if (bcol > bcol_max) break;
73             if (PetscBTLookup(table_in,bcol)) {
74               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75               break; /* for l = start; l<end ; l++) */
76             }
77           }
78         }
79       }
80     } /* for each overlap */
81 
82     /* expand the Index Set */
83     for (j=0; j<isz; j++) {
84       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85     }
86     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
87   }
88   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
89   ierr = PetscFree(nidx);CHKERRQ(ierr);
90   ierr = PetscFree(nidx2);CHKERRQ(ierr);
91   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ_Private"
97 PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
98 {
99   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
100   PetscErrorCode ierr;
101   PetscInt       *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
102   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
103   PetscInt       nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
104   const PetscInt *irow,*aj = a->j,*ai = a->i;
105   MatScalar      *mat_a;
106   Mat            C;
107   PetscBool      flag,sorted;
108 
109   PetscFunctionBegin;
110   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
111   ierr = ISSorted(iscol,&sorted);CHKERRQ(ierr);
112   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
113 
114   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
115   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
116 
117   ierr  = PetscMalloc1(oldcols,&smap);CHKERRQ(ierr);
118   ierr  = PetscMemzero(smap,oldcols*sizeof(PetscInt));CHKERRQ(ierr);
119   ssmap = smap;
120   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
121   for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
122   /* determine lens of each row */
123   for (i=0; i<nrows; i++) {
124     kstart  = ai[irow[i]];
125     kend    = kstart + a->ilen[irow[i]];
126     lens[i] = 0;
127     for (k=kstart; k<kend; k++) {
128       if (ssmap[aj[k]]) lens[i]++;
129     }
130   }
131   /* Create and fill new matrix */
132   if (scall == MAT_REUSE_MATRIX) {
133     c = (Mat_SeqSBAIJ*)((*B)->data);
134 
135     if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136     ierr = PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);CHKERRQ(ierr);
137     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
138     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));CHKERRQ(ierr);
139     C    = *B;
140   } else {
141     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
142     ierr = MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
143     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
144     ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);CHKERRQ(ierr);
145   }
146   c = (Mat_SeqSBAIJ*)(C->data);
147   for (i=0; i<nrows; i++) {
148     row      = irow[i];
149     kstart   = ai[row];
150     kend     = kstart + a->ilen[row];
151     mat_i    = c->i[i];
152     mat_j    = c->j + mat_i;
153     mat_a    = c->a + mat_i*bs2;
154     mat_ilen = c->ilen + i;
155     for (k=kstart; k<kend; k++) {
156       if ((tcol=ssmap[a->j[k]])) {
157         *mat_j++ = tcol - 1;
158         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
159         mat_a   += bs2;
160         (*mat_ilen)++;
161       }
162     }
163   }
164 
165   /* Free work space */
166   ierr = PetscFree(smap);CHKERRQ(ierr);
167   ierr = PetscFree(lens);CHKERRQ(ierr);
168   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
169   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
170 
171   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
172   *B   = C;
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatGetSubMatrix_SeqSBAIJ"
178 PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
179 {
180   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
181   IS             is1;
182   PetscErrorCode ierr;
183   PetscInt       *vary,*iary,nrows,i,bs=A->rmap->bs,count;
184   const PetscInt *irow;
185 
186   PetscFunctionBegin;
187   if (isrow != iscol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
188 
189   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
190   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
191 
192   /* Verify if the indices corespond to each element in a block
193    and form the IS with compressed IS */
194   ierr = PetscMalloc2(a->mbs,&vary,a->mbs,&iary);CHKERRQ(ierr);
195   ierr = PetscMemzero(vary,a->mbs*sizeof(PetscInt));CHKERRQ(ierr);
196   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
197 
198   count = 0;
199   for (i=0; i<a->mbs; i++) {
200     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
201     if (vary[i]==bs) iary[count++] = i;
202   }
203   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
204   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
205   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
206 
207   ierr = MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);CHKERRQ(ierr);
208   ierr = ISDestroy(&is1);CHKERRQ(ierr);
209   PetscFunctionReturn(0);
210 }
211 
212 #undef __FUNCT__
213 #define __FUNCT__ "MatGetSubMatrices_SeqSBAIJ"
214 PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
215 {
216   PetscErrorCode ierr;
217   PetscInt       i;
218 
219   PetscFunctionBegin;
220   if (scall == MAT_INITIAL_MATRIX) {
221     ierr = PetscMalloc1(n+1,B);CHKERRQ(ierr);
222   }
223 
224   for (i=0; i<n; i++) {
225     ierr = MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
226   }
227   PetscFunctionReturn(0);
228 }
229 
230 /* -------------------------------------------------------*/
231 /* Should check that shapes of vectors and matrices match */
232 /* -------------------------------------------------------*/
233 
234 #undef __FUNCT__
235 #define __FUNCT__ "MatMult_SeqSBAIJ_2"
236 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
237 {
238   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
239   PetscScalar    *x,*z,*xb,x1,x2,zero=0.0;
240   MatScalar      *v;
241   PetscErrorCode ierr;
242   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
243   PetscInt       nonzerorow=0;
244 
245   PetscFunctionBegin;
246   ierr = VecSet(zz,zero);CHKERRQ(ierr);
247   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
249 
250   v  = a->a;
251   xb = x;
252 
253   for (i=0; i<mbs; i++) {
254     n           = ai[1] - ai[0]; /* length of i_th block row of A */
255     x1          = xb[0]; x2 = xb[1];
256     ib          = aj + *ai;
257     jmin        = 0;
258     nonzerorow += (n>0);
259     if (*ib == i) {     /* (diag of A)*x */
260       z[2*i]   += v[0]*x1 + v[2]*x2;
261       z[2*i+1] += v[2]*x1 + v[3]*x2;
262       v        += 4; jmin++;
263     }
264     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
265     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
266     for (j=jmin; j<n; j++) {
267       /* (strict lower triangular part of A)*x  */
268       cval       = ib[j]*2;
269       z[cval]   += v[0]*x1 + v[1]*x2;
270       z[cval+1] += v[2]*x1 + v[3]*x2;
271       /* (strict upper triangular part of A)*x  */
272       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
273       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
274       v        += 4;
275     }
276     xb +=2; ai++;
277   }
278 
279   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
280   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
281   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
285 #undef __FUNCT__
286 #define __FUNCT__ "MatMult_SeqSBAIJ_3"
287 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
288 {
289   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
290   PetscScalar    *x,*z,*xb,x1,x2,x3,zero=0.0;
291   MatScalar      *v;
292   PetscErrorCode ierr;
293   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
294   PetscInt       nonzerorow=0;
295 
296   PetscFunctionBegin;
297   ierr = VecSet(zz,zero);CHKERRQ(ierr);
298   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
299   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
300 
301   v  = a->a;
302   xb = x;
303 
304   for (i=0; i<mbs; i++) {
305     n           = ai[1] - ai[0]; /* length of i_th block row of A */
306     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
307     ib          = aj + *ai;
308     jmin        = 0;
309     nonzerorow += (n>0);
310     if (*ib == i) {     /* (diag of A)*x */
311       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
312       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
313       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
314       v        += 9; jmin++;
315     }
316     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
317     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
318     for (j=jmin; j<n; j++) {
319       /* (strict lower triangular part of A)*x  */
320       cval       = ib[j]*3;
321       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
322       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
323       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
324       /* (strict upper triangular part of A)*x  */
325       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
326       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
327       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
328       v        += 9;
329     }
330     xb +=3; ai++;
331   }
332 
333   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
334   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
335   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
336   PetscFunctionReturn(0);
337 }
338 
339 #undef __FUNCT__
340 #define __FUNCT__ "MatMult_SeqSBAIJ_4"
341 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
342 {
343   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
344   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
345   MatScalar      *v;
346   PetscErrorCode ierr;
347   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
348   PetscInt       nonzerorow=0;
349 
350   PetscFunctionBegin;
351   ierr = VecSet(zz,zero);CHKERRQ(ierr);
352   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
353   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
354 
355   v  = a->a;
356   xb = x;
357 
358   for (i=0; i<mbs; i++) {
359     n           = ai[1] - ai[0]; /* length of i_th block row of A */
360     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
361     ib          = aj + *ai;
362     jmin        = 0;
363     nonzerorow += (n>0);
364     if (*ib == i) {     /* (diag of A)*x */
365       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
366       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
367       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
368       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
369       v        += 16; jmin++;
370     }
371     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
372     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
373     for (j=jmin; j<n; j++) {
374       /* (strict lower triangular part of A)*x  */
375       cval       = ib[j]*4;
376       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
377       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
378       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
379       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
380       /* (strict upper triangular part of A)*x  */
381       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
382       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
383       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
384       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
385       v        += 16;
386     }
387     xb +=4; ai++;
388   }
389 
390   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
391   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
392   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
393   PetscFunctionReturn(0);
394 }
395 
396 #undef __FUNCT__
397 #define __FUNCT__ "MatMult_SeqSBAIJ_5"
398 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
399 {
400   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
401   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
402   MatScalar      *v;
403   PetscErrorCode ierr;
404   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
405   PetscInt       nonzerorow=0;
406 
407   PetscFunctionBegin;
408   ierr = VecSet(zz,zero);CHKERRQ(ierr);
409   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411 
412   v  = a->a;
413   xb = x;
414 
415   for (i=0; i<mbs; i++) {
416     n           = ai[1] - ai[0]; /* length of i_th block row of A */
417     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
418     ib          = aj + *ai;
419     jmin        = 0;
420     nonzerorow += (n>0);
421     if (*ib == i) {      /* (diag of A)*x */
422       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
423       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
424       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
425       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
426       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
427       v        += 25; jmin++;
428     }
429     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
430     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
431     for (j=jmin; j<n; j++) {
432       /* (strict lower triangular part of A)*x  */
433       cval       = ib[j]*5;
434       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
435       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
436       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
437       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
438       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
439       /* (strict upper triangular part of A)*x  */
440       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
441       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
442       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
443       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
444       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
445       v        += 25;
446     }
447     xb +=5; ai++;
448   }
449 
450   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
451   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
452   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
453   PetscFunctionReturn(0);
454 }
455 
456 
457 #undef __FUNCT__
458 #define __FUNCT__ "MatMult_SeqSBAIJ_6"
459 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
460 {
461   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
462   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
463   MatScalar      *v;
464   PetscErrorCode ierr;
465   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
466   PetscInt       nonzerorow=0;
467 
468   PetscFunctionBegin;
469   ierr = VecSet(zz,zero);CHKERRQ(ierr);
470   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
471   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
472 
473   v  = a->a;
474   xb = x;
475 
476   for (i=0; i<mbs; i++) {
477     n           = ai[1] - ai[0]; /* length of i_th block row of A */
478     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
479     ib          = aj + *ai;
480     jmin        = 0;
481     nonzerorow += (n>0);
482     if (*ib == i) {      /* (diag of A)*x */
483       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
484       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
485       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
486       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
487       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
488       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
489       v        += 36; jmin++;
490     }
491     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
492     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493     for (j=jmin; j<n; j++) {
494       /* (strict lower triangular part of A)*x  */
495       cval       = ib[j]*6;
496       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
497       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
498       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
499       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
500       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
501       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
502       /* (strict upper triangular part of A)*x  */
503       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
504       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
505       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
506       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
507       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
508       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
509       v        += 36;
510     }
511     xb +=6; ai++;
512   }
513 
514   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
515   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
516   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
517   PetscFunctionReturn(0);
518 }
519 #undef __FUNCT__
520 #define __FUNCT__ "MatMult_SeqSBAIJ_7"
521 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
522 {
523   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
524   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
525   MatScalar      *v;
526   PetscErrorCode ierr;
527   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
528   PetscInt       nonzerorow=0;
529 
530   PetscFunctionBegin;
531   ierr = VecSet(zz,zero);CHKERRQ(ierr);
532   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
533   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
534 
535   v  = a->a;
536   xb = x;
537 
538   for (i=0; i<mbs; i++) {
539     n           = ai[1] - ai[0]; /* length of i_th block row of A */
540     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
541     ib          = aj + *ai;
542     jmin        = 0;
543     nonzerorow += (n>0);
544     if (*ib == i) {      /* (diag of A)*x */
545       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
546       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
547       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
548       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
549       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
550       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
551       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
552       v        += 49; jmin++;
553     }
554     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
555     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
556     for (j=jmin; j<n; j++) {
557       /* (strict lower triangular part of A)*x  */
558       cval       = ib[j]*7;
559       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
560       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
561       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
562       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
563       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
564       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
565       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
566       /* (strict upper triangular part of A)*x  */
567       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
568       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
569       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
570       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
571       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
572       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
573       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
574       v       += 49;
575     }
576     xb +=7; ai++;
577   }
578   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
579   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
580   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
581   PetscFunctionReturn(0);
582 }
583 
584 /*
585     This will not work with MatScalar == float because it calls the BLAS
586 */
587 #undef __FUNCT__
588 #define __FUNCT__ "MatMult_SeqSBAIJ_N"
589 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
590 {
591   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
592   PetscScalar    *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
593   MatScalar      *v;
594   PetscErrorCode ierr;
595   PetscInt       mbs       =a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
596   PetscInt       nonzerorow=0;
597 
598   PetscFunctionBegin;
599   ierr = VecSet(zz,zero);CHKERRQ(ierr);
600   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
601   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
602 
603   aj = a->j;
604   v  = a->a;
605   ii = a->i;
606 
607   if (!a->mult_work) {
608     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
609   }
610   work = a->mult_work;
611 
612   for (i=0; i<mbs; i++) {
613     n           = ii[1] - ii[0]; ncols = n*bs;
614     workt       = work; idx=aj+ii[0];
615     nonzerorow += (n>0);
616 
617     /* upper triangular part */
618     for (j=0; j<n; j++) {
619       xb = x_ptr + bs*(*idx++);
620       for (k=0; k<bs; k++) workt[k] = xb[k];
621       workt += bs;
622     }
623     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
624     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
625 
626     /* strict lower triangular part */
627     idx = aj+ii[0];
628     if (*idx == i) {
629       ncols -= bs; v += bs2; idx++; n--;
630     }
631 
632     if (ncols > 0) {
633       workt = work;
634       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
635       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
636       for (j=0; j<n; j++) {
637         zb = z_ptr + bs*(*idx++);
638         for (k=0; k<bs; k++) zb[k] += workt[k];
639         workt += bs;
640       }
641     }
642     x += bs; v += n*bs2; z += bs; ii++;
643   }
644 
645   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
646   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
647   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
648   PetscFunctionReturn(0);
649 }
650 
651 #undef __FUNCT__
652 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_1"
653 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
654 {
655   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
656   PetscScalar    *x,*z,*xb,x1;
657   MatScalar      *v;
658   PetscErrorCode ierr;
659   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
660   PetscInt       nonzerorow=0;
661 
662   PetscFunctionBegin;
663   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
664   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
665   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
666   v    = a->a;
667   xb   = x;
668 
669   for (i=0; i<mbs; i++) {
670     n           = ai[1] - ai[0]; /* length of i_th row of A */
671     x1          = xb[0];
672     ib          = aj + *ai;
673     jmin        = 0;
674     nonzerorow += (n>0);
675     if (*ib == i) {            /* (diag of A)*x */
676       z[i] += *v++ * x[*ib++]; jmin++;
677     }
678     for (j=jmin; j<n; j++) {
679       cval    = *ib;
680       z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
681       z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
682     }
683     xb++; ai++;
684   }
685 
686   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
687   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
688 
689   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
690   PetscFunctionReturn(0);
691 }
692 
693 #undef __FUNCT__
694 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_2"
695 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
696 {
697   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
698   PetscScalar    *x,*z,*xb,x1,x2;
699   MatScalar      *v;
700   PetscErrorCode ierr;
701   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
702   PetscInt       nonzerorow=0;
703 
704   PetscFunctionBegin;
705   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
706   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
707   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
708 
709   v  = a->a;
710   xb = x;
711 
712   for (i=0; i<mbs; i++) {
713     n           = ai[1] - ai[0]; /* length of i_th block row of A */
714     x1          = xb[0]; x2 = xb[1];
715     ib          = aj + *ai;
716     jmin        = 0;
717     nonzerorow += (n>0);
718     if (*ib == i) {      /* (diag of A)*x */
719       z[2*i]   += v[0]*x1 + v[2]*x2;
720       z[2*i+1] += v[2]*x1 + v[3]*x2;
721       v        += 4; jmin++;
722     }
723     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
724     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
725     for (j=jmin; j<n; j++) {
726       /* (strict lower triangular part of A)*x  */
727       cval       = ib[j]*2;
728       z[cval]   += v[0]*x1 + v[1]*x2;
729       z[cval+1] += v[2]*x1 + v[3]*x2;
730       /* (strict upper triangular part of A)*x  */
731       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
732       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
733       v        += 4;
734     }
735     xb +=2; ai++;
736   }
737   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
738   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
739 
740   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
741   PetscFunctionReturn(0);
742 }
743 
744 #undef __FUNCT__
745 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_3"
746 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
747 {
748   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
749   PetscScalar    *x,*z,*xb,x1,x2,x3;
750   MatScalar      *v;
751   PetscErrorCode ierr;
752   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
753   PetscInt       nonzerorow=0;
754 
755   PetscFunctionBegin;
756   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
757   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
758   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
759 
760   v  = a->a;
761   xb = x;
762 
763   for (i=0; i<mbs; i++) {
764     n           = ai[1] - ai[0]; /* length of i_th block row of A */
765     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
766     ib          = aj + *ai;
767     jmin        = 0;
768     nonzerorow += (n>0);
769     if (*ib == i) {     /* (diag of A)*x */
770       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
771       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
772       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
773       v        += 9; jmin++;
774     }
775     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
776     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
777     for (j=jmin; j<n; j++) {
778       /* (strict lower triangular part of A)*x  */
779       cval       = ib[j]*3;
780       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
781       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
782       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
783       /* (strict upper triangular part of A)*x  */
784       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
785       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
786       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
787       v        += 9;
788     }
789     xb +=3; ai++;
790   }
791 
792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
794 
795   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
796   PetscFunctionReturn(0);
797 }
798 
799 #undef __FUNCT__
800 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_4"
801 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
802 {
803   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
804   PetscScalar    *x,*z,*xb,x1,x2,x3,x4;
805   MatScalar      *v;
806   PetscErrorCode ierr;
807   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
808   PetscInt       nonzerorow=0;
809 
810   PetscFunctionBegin;
811   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
812   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
813   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
814 
815   v  = a->a;
816   xb = x;
817 
818   for (i=0; i<mbs; i++) {
819     n           = ai[1] - ai[0]; /* length of i_th block row of A */
820     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
821     ib          = aj + *ai;
822     jmin        = 0;
823     nonzerorow += (n>0);
824     if (*ib == i) {      /* (diag of A)*x */
825       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
826       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
827       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
828       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
829       v        += 16; jmin++;
830     }
831     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
832     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
833     for (j=jmin; j<n; j++) {
834       /* (strict lower triangular part of A)*x  */
835       cval       = ib[j]*4;
836       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
837       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
838       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
839       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
840       /* (strict upper triangular part of A)*x  */
841       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
842       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
843       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
844       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
845       v        += 16;
846     }
847     xb +=4; ai++;
848   }
849 
850   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
851   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
852 
853   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
854   PetscFunctionReturn(0);
855 }
856 
857 #undef __FUNCT__
858 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_5"
859 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
860 {
861   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
862   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5;
863   MatScalar      *v;
864   PetscErrorCode ierr;
865   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
866   PetscInt       nonzerorow=0;
867 
868   PetscFunctionBegin;
869   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
870   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
871   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
872 
873   v  = a->a;
874   xb = x;
875 
876   for (i=0; i<mbs; i++) {
877     n           = ai[1] - ai[0]; /* length of i_th block row of A */
878     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
879     ib          = aj + *ai;
880     jmin        = 0;
881     nonzerorow += (n>0);
882     if (*ib == i) {      /* (diag of A)*x */
883       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
884       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
885       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
886       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
887       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
888       v        += 25; jmin++;
889     }
890     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
891     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
892     for (j=jmin; j<n; j++) {
893       /* (strict lower triangular part of A)*x  */
894       cval       = ib[j]*5;
895       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
896       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
897       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
898       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
899       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
900       /* (strict upper triangular part of A)*x  */
901       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
902       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
903       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
904       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
905       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
906       v        += 25;
907     }
908     xb +=5; ai++;
909   }
910 
911   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
912   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
913 
914   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
915   PetscFunctionReturn(0);
916 }
917 #undef __FUNCT__
918 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_6"
919 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
920 {
921   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
922   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6;
923   MatScalar      *v;
924   PetscErrorCode ierr;
925   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
926   PetscInt       nonzerorow=0;
927 
928   PetscFunctionBegin;
929   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
930   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
931   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
932 
933   v  = a->a;
934   xb = x;
935 
936   for (i=0; i<mbs; i++) {
937     n           = ai[1] - ai[0]; /* length of i_th block row of A */
938     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
939     ib          = aj + *ai;
940     jmin        = 0;
941     nonzerorow += (n>0);
942     if (*ib == i) {     /* (diag of A)*x */
943       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
944       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
945       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
946       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
947       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
948       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
949       v        += 36; jmin++;
950     }
951     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
952     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
953     for (j=jmin; j<n; j++) {
954       /* (strict lower triangular part of A)*x  */
955       cval       = ib[j]*6;
956       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
957       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
958       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
959       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
960       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
961       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
962       /* (strict upper triangular part of A)*x  */
963       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
964       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
965       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
966       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
967       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
968       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
969       v        += 36;
970     }
971     xb +=6; ai++;
972   }
973 
974   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
975   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
976 
977   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
978   PetscFunctionReturn(0);
979 }
980 
981 #undef __FUNCT__
982 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_7"
983 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
984 {
985   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
986   PetscScalar    *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
987   MatScalar      *v;
988   PetscErrorCode ierr;
989   PetscInt       mbs       =a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
990   PetscInt       nonzerorow=0;
991 
992   PetscFunctionBegin;
993   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
994   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
995   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
996 
997   v  = a->a;
998   xb = x;
999 
1000   for (i=0; i<mbs; i++) {
1001     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1002     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1003     ib          = aj + *ai;
1004     jmin        = 0;
1005     nonzerorow += (n>0);
1006     if (*ib == i) {     /* (diag of A)*x */
1007       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1008       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1009       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1010       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1011       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1012       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1013       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1014       v        += 49; jmin++;
1015     }
1016     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1017     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1018     for (j=jmin; j<n; j++) {
1019       /* (strict lower triangular part of A)*x  */
1020       cval       = ib[j]*7;
1021       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1022       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1023       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1024       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1025       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1026       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1027       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1028       /* (strict upper triangular part of A)*x  */
1029       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1030       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1031       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1032       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1033       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1034       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1035       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1036       v       += 49;
1037     }
1038     xb +=7; ai++;
1039   }
1040 
1041   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1042   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1043 
1044   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1045   PetscFunctionReturn(0);
1046 }
1047 
1048 #undef __FUNCT__
1049 #define __FUNCT__ "MatMultAdd_SeqSBAIJ_N"
1050 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1051 {
1052   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1053   PetscScalar    *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1054   MatScalar      *v;
1055   PetscErrorCode ierr;
1056   PetscInt       mbs       =a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1057   PetscInt       nonzerorow=0;
1058 
1059   PetscFunctionBegin;
1060   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1061   ierr = VecGetArray(xx,&x);CHKERRQ(ierr); x_ptr=x;
1062   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1063 
1064   aj = a->j;
1065   v  = a->a;
1066   ii = a->i;
1067 
1068   if (!a->mult_work) {
1069     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1070   }
1071   work = a->mult_work;
1072 
1073 
1074   for (i=0; i<mbs; i++) {
1075     n           = ii[1] - ii[0]; ncols = n*bs;
1076     workt       = work; idx=aj+ii[0];
1077     nonzerorow += (n>0);
1078 
1079     /* upper triangular part */
1080     for (j=0; j<n; j++) {
1081       xb = x_ptr + bs*(*idx++);
1082       for (k=0; k<bs; k++) workt[k] = xb[k];
1083       workt += bs;
1084     }
1085     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1086     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1087 
1088     /* strict lower triangular part */
1089     idx = aj+ii[0];
1090     if (*idx == i) {
1091       ncols -= bs; v += bs2; idx++; n--;
1092     }
1093     if (ncols > 0) {
1094       workt = work;
1095       ierr  = PetscMemzero(workt,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1096       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1097       for (j=0; j<n; j++) {
1098         zb = z_ptr + bs*(*idx++);
1099         for (k=0; k<bs; k++) zb[k] += workt[k];
1100         workt += bs;
1101       }
1102     }
1103 
1104     x += bs; v += n*bs2; z += bs; ii++;
1105   }
1106 
1107   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1108   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1109 
1110   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 #undef __FUNCT__
1115 #define __FUNCT__ "MatScale_SeqSBAIJ"
1116 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1117 {
1118   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1119   PetscScalar    oalpha = alpha;
1120   PetscErrorCode ierr;
1121   PetscBLASInt   one = 1,totalnz;
1122 
1123   PetscFunctionBegin;
1124   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1125   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1126   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1127   PetscFunctionReturn(0);
1128 }
1129 
1130 #undef __FUNCT__
1131 #define __FUNCT__ "MatNorm_SeqSBAIJ"
1132 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1133 {
1134   Mat_SeqSBAIJ   *a       = (Mat_SeqSBAIJ*)A->data;
1135   MatScalar      *v       = a->a;
1136   PetscReal      sum_diag = 0.0, sum_off = 0.0, *sum;
1137   PetscInt       i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1138   PetscErrorCode ierr;
1139   PetscInt       *jl,*il,jmin,jmax,nexti,ik,*col;
1140 
1141   PetscFunctionBegin;
1142   if (type == NORM_FROBENIUS) {
1143     for (k=0; k<mbs; k++) {
1144       jmin = a->i[k]; jmax = a->i[k+1];
1145       col  = aj + jmin;
1146       if (*col == k) {         /* diagonal block */
1147         for (i=0; i<bs2; i++) {
1148           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1149         }
1150         jmin++;
1151       }
1152       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1153         for (i=0; i<bs2; i++) {
1154           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1155         }
1156       }
1157     }
1158     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1159   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1160     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1161     for (i=0; i<mbs; i++) jl[i] = mbs;
1162     il[0] = 0;
1163 
1164     *norm = 0.0;
1165     for (k=0; k<mbs; k++) { /* k_th block row */
1166       for (j=0; j<bs; j++) sum[j]=0.0;
1167       /*-- col sum --*/
1168       i = jl[k]; /* first |A(i,k)| to be added */
1169       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1170                   at step k */
1171       while (i<mbs) {
1172         nexti = jl[i];  /* next block row to be added */
1173         ik    = il[i];  /* block index of A(i,k) in the array a */
1174         for (j=0; j<bs; j++) {
1175           v = a->a + ik*bs2 + j*bs;
1176           for (k1=0; k1<bs; k1++) {
1177             sum[j] += PetscAbsScalar(*v); v++;
1178           }
1179         }
1180         /* update il, jl */
1181         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1182         jmax = a->i[i+1];
1183         if (jmin < jmax) {
1184           il[i] = jmin;
1185           j     = a->j[jmin];
1186           jl[i] = jl[j]; jl[j]=i;
1187         }
1188         i = nexti;
1189       }
1190       /*-- row sum --*/
1191       jmin = a->i[k]; jmax = a->i[k+1];
1192       for (i=jmin; i<jmax; i++) {
1193         for (j=0; j<bs; j++) {
1194           v = a->a + i*bs2 + j;
1195           for (k1=0; k1<bs; k1++) {
1196             sum[j] += PetscAbsScalar(*v); v += bs;
1197           }
1198         }
1199       }
1200       /* add k_th block row to il, jl */
1201       col = aj+jmin;
1202       if (*col == k) jmin++;
1203       if (jmin < jmax) {
1204         il[k] = jmin;
1205         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1206       }
1207       for (j=0; j<bs; j++) {
1208         if (sum[j] > *norm) *norm = sum[j];
1209       }
1210     }
1211     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1212   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1213   PetscFunctionReturn(0);
1214 }
1215 
1216 #undef __FUNCT__
1217 #define __FUNCT__ "MatEqual_SeqSBAIJ"
1218 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1219 {
1220   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1221   PetscErrorCode ierr;
1222 
1223   PetscFunctionBegin;
1224   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1225   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1226     *flg = PETSC_FALSE;
1227     PetscFunctionReturn(0);
1228   }
1229 
1230   /* if the a->i are the same */
1231   ierr = PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1232   if (!*flg) PetscFunctionReturn(0);
1233 
1234   /* if a->j are the same */
1235   ierr = PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);CHKERRQ(ierr);
1236   if (!*flg) PetscFunctionReturn(0);
1237 
1238   /* if a->a are the same */
1239   ierr = PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);CHKERRQ(ierr);
1240   PetscFunctionReturn(0);
1241 }
1242 
1243 #undef __FUNCT__
1244 #define __FUNCT__ "MatGetDiagonal_SeqSBAIJ"
1245 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1246 {
1247   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1248   PetscErrorCode ierr;
1249   PetscInt       i,j,k,row,bs,*ai,*aj,ambs,bs2;
1250   PetscScalar    *x,zero = 0.0;
1251   MatScalar      *aa,*aa_j;
1252 
1253   PetscFunctionBegin;
1254   bs = A->rmap->bs;
1255   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1256 
1257   aa   = a->a;
1258   ambs = a->mbs;
1259 
1260   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1261     PetscInt *diag=a->diag;
1262     aa   = a->a;
1263     ambs = a->mbs;
1264     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1265     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1266     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1267     PetscFunctionReturn(0);
1268   }
1269 
1270   ai   = a->i;
1271   aj   = a->j;
1272   bs2  = a->bs2;
1273   ierr = VecSet(v,zero);CHKERRQ(ierr);
1274   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1275   for (i=0; i<ambs; i++) {
1276     j=ai[i];
1277     if (aj[j] == i) {    /* if this is a diagonal element */
1278       row  = i*bs;
1279       aa_j = aa + j*bs2;
1280       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1281     }
1282   }
1283   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1284   PetscFunctionReturn(0);
1285 }
1286 
1287 #undef __FUNCT__
1288 #define __FUNCT__ "MatDiagonalScale_SeqSBAIJ"
1289 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1290 {
1291   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1292   PetscScalar    *l,x,*li,*ri;
1293   MatScalar      *aa,*v;
1294   PetscErrorCode ierr;
1295   PetscInt       i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1296   PetscBool      flg;
1297 
1298   PetscFunctionBegin;
1299   if (ll != rr) {
1300     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1301     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1302   }
1303   if (!ll) PetscFunctionReturn(0);
1304   ai  = a->i;
1305   aj  = a->j;
1306   aa  = a->a;
1307   m   = A->rmap->N;
1308   bs  = A->rmap->bs;
1309   mbs = a->mbs;
1310   bs2 = a->bs2;
1311 
1312   ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1313   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1314   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1315   for (i=0; i<mbs; i++) { /* for each block row */
1316     M  = ai[i+1] - ai[i];
1317     li = l + i*bs;
1318     v  = aa + bs2*ai[i];
1319     for (j=0; j<M; j++) { /* for each block */
1320       ri = l + bs*aj[ai[i]+j];
1321       for (k=0; k<bs; k++) {
1322         x = ri[k];
1323         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1324       }
1325     }
1326   }
1327   ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1328   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 #undef __FUNCT__
1333 #define __FUNCT__ "MatGetInfo_SeqSBAIJ"
1334 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1335 {
1336   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1337 
1338   PetscFunctionBegin;
1339   info->block_size   = a->bs2;
1340   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1341   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1342   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
1343   info->assemblies   = A->num_ass;
1344   info->mallocs      = A->info.mallocs;
1345   info->memory       = ((PetscObject)A)->mem;
1346   if (A->factortype) {
1347     info->fill_ratio_given  = A->info.fill_ratio_given;
1348     info->fill_ratio_needed = A->info.fill_ratio_needed;
1349     info->factor_mallocs    = A->info.factor_mallocs;
1350   } else {
1351     info->fill_ratio_given  = 0;
1352     info->fill_ratio_needed = 0;
1353     info->factor_mallocs    = 0;
1354   }
1355   PetscFunctionReturn(0);
1356 }
1357 
1358 
1359 #undef __FUNCT__
1360 #define __FUNCT__ "MatZeroEntries_SeqSBAIJ"
1361 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1362 {
1363   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1364   PetscErrorCode ierr;
1365 
1366   PetscFunctionBegin;
1367   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1368   PetscFunctionReturn(0);
1369 }
1370 
1371 #undef __FUNCT__
1372 #define __FUNCT__ "MatGetRowMaxAbs_SeqSBAIJ"
1373 /*
1374    This code does not work since it only checks the upper triangular part of
1375   the matrix. Hence it is not listed in the function table.
1376 */
1377 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1378 {
1379   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1380   PetscErrorCode ierr;
1381   PetscInt       i,j,n,row,col,bs,*ai,*aj,mbs;
1382   PetscReal      atmp;
1383   MatScalar      *aa;
1384   PetscScalar    *x;
1385   PetscInt       ncols,brow,bcol,krow,kcol;
1386 
1387   PetscFunctionBegin;
1388   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1389   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1390   bs  = A->rmap->bs;
1391   aa  = a->a;
1392   ai  = a->i;
1393   aj  = a->j;
1394   mbs = a->mbs;
1395 
1396   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1397   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1398   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1399   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1400   for (i=0; i<mbs; i++) {
1401     ncols = ai[1] - ai[0]; ai++;
1402     brow  = bs*i;
1403     for (j=0; j<ncols; j++) {
1404       bcol = bs*(*aj);
1405       for (kcol=0; kcol<bs; kcol++) {
1406         col = bcol + kcol;      /* col index */
1407         for (krow=0; krow<bs; krow++) {
1408           atmp = PetscAbsScalar(*aa); aa++;
1409           row  = brow + krow;   /* row index */
1410           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1411           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1412         }
1413       }
1414       aj++;
1415     }
1416   }
1417   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1418   PetscFunctionReturn(0);
1419 }
1420