xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 4ccb76dd2a06ade17c2800eec40bc3f6b9abd66f)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.46 1999/05/12 03:29:44 bsmith Exp balay $";
3 #endif
4 
5 #include "sys.h"
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 #include "src/inline/ilu.h"
10 #include "bitarray.h"
11 
12 #undef __FUNC__
13 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
14 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
15 {
16   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
17   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
18   int         start, end, *ai, *aj,bs,*nidx2;
19   BT          table;
20 
21   PetscFunctionBegin;
22   m     = a->mbs;
23   ai    = a->i;
24   aj    = a->j;
25   bs    = a->bs;
26 
27   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
28 
29   ierr  = BTCreate(m,table);CHKERRQ(ierr);
30   nidx  = (int *) PetscMalloc((m+1)*sizeof(int));CHKPTRQ(nidx);
31   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int));CHKPTRQ(nidx2);
32 
33   for ( i=0; i<is_max; i++ ) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     BTMemzero(m,table);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for ( j=0; j<n ; ++j){
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
46       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for ( j=0; j<ov; j++){ /* for each overlap*/
53       n = isz;
54       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for ( l = start; l<end ; l++){
59           val = aj[l];
60           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++ ) {
66       for (k=0; k<bs; k++ )
67         nidx2[j*bs+k] = nidx[j]*bs+k;
68     }
69     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i));CHKERRQ(ierr);
70   }
71   BTDestroy(table);
72   PetscFree(nidx);
73   PetscFree(nidx2);
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNC__
78 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
79 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
80 {
81   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
82   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
83   int          row,mat_i,*mat_j,tcol,*mat_ilen;
84   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
85   int          *aj = a->j, *ai = a->i;
86   MatScalar    *mat_a;
87   Mat          C;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
91   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
99   ssmap = smap;
100   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
101   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
102   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for ( k=kstart; k<kend; k++ ) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
119     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
120       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
121     }
122     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
123     C = *B;
124   } else {
125     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
126   }
127   c = (Mat_SeqBAIJ *)(C->data);
128   for (i=0; i<nrows; i++) {
129     row    = irow[i];
130     kstart = ai[row];
131     kend   = kstart + a->ilen[row];
132     mat_i  = c->i[i];
133     mat_j  = c->j + mat_i;
134     mat_a  = c->a + mat_i*bs2;
135     mat_ilen = c->ilen + i;
136     for ( k=kstart; k<kend; k++ ) {
137       if ((tcol=ssmap[a->j[k]])) {
138         *mat_j++ = tcol - 1;
139         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
140         mat_a   += bs2;
141         (*mat_ilen)++;
142       }
143     }
144   }
145 
146   /* Free work space */
147   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
148   PetscFree(smap); PetscFree(lens);
149   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
150   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
151 
152   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
153   *B = C;
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNC__
158 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
159 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
160 {
161   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
162   IS          is1,is2;
163   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
164 
165   PetscFunctionBegin;
166   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
167   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
168   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
169   ierr = ISGetSize(iscol,&ncols);CHKERRQ(ierr);
170 
171   /* Verify if the indices corespond to each element in a block
172    and form the IS with compressed IS */
173   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
174   iary = vary + a->mbs;
175   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
176   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
177   count = 0;
178   for (i=0; i<a->mbs; i++) {
179     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
180     if (vary[i]==bs) iary[count++] = i;
181   }
182   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1);CHKERRQ(ierr);
183 
184   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
185   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
186   count = 0;
187   for (i=0; i<a->mbs; i++) {
188     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
189     if (vary[i]==bs) iary[count++] = i;
190   }
191   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2);CHKERRQ(ierr);
192   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
193   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
194   PetscFree(vary);
195 
196   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);CHKERRQ(ierr);
197   ISDestroy(is1);
198   ISDestroy(is2);
199   PetscFunctionReturn(0);
200 }
201 
202 #undef __FUNC__
203 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
204 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatReuse scall,Mat **B)
205 {
206   int ierr,i;
207 
208   PetscFunctionBegin;
209   if (scall == MAT_INITIAL_MATRIX) {
210     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) );CHKPTRQ(*B);
211   }
212 
213   for ( i=0; i<n; i++ ) {
214     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
215   }
216   PetscFunctionReturn(0);
217 }
218 
219 
220 /* -------------------------------------------------------*/
221 /* Should check that shapes of vectors and matrices match */
222 /* -------------------------------------------------------*/
223 #include "pinclude/blaslapack.h"
224 
225 #undef __FUNC__
226 #define __FUNC__ "MatMult_SeqBAIJ_1"
227 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
228 {
229   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
230   Scalar          *x,*z,sum;
231   MatScalar       *v;
232   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
233 
234   PetscFunctionBegin;
235   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
236   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
237 
238   idx   = a->j;
239   v     = a->a;
240   ii    = a->i;
241 
242   for ( i=0; i<mbs; i++ ) {
243     n    = ii[1] - ii[0]; ii++;
244     sum  = 0.0;
245     while (n--) sum += *v++ * x[*idx++];
246     z[i] = sum;
247   }
248   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
249   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
250   PLogFlops(2*a->nz - a->m);
251   PetscFunctionReturn(0);
252 }
253 
254 #undef __FUNC__
255 #define __FUNC__ "MatMult_SeqBAIJ_2"
256 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
257 {
258   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
259   Scalar          *x,*z,*xb,sum1,sum2;
260   Scalar          x1,x2;
261   MatScalar       *v;
262   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
263 
264   PetscFunctionBegin;
265   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
266   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
267 
268   idx   = a->j;
269   v     = a->a;
270   ii    = a->i;
271 
272   for ( i=0; i<mbs; i++ ) {
273     n  = ii[1] - ii[0]; ii++;
274     sum1 = 0.0; sum2 = 0.0;
275     for ( j=0; j<n; j++ ) {
276       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
277       sum1 += v[0]*x1 + v[2]*x2;
278       sum2 += v[1]*x1 + v[3]*x2;
279       v += 4;
280     }
281     z[0] = sum1; z[1] = sum2;
282     z += 2;
283   }
284   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
285   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
286   PLogFlops(8*a->nz - a->m);
287   PetscFunctionReturn(0);
288 }
289 
290 #undef __FUNC__
291 #define __FUNC__ "MatMult_SeqBAIJ_3"
292 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
293 {
294   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data;
295   Scalar       *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
296   MatScalar    *v;
297   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;
298 
299 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
300 #pragma disjoint(*v,*z,*xb)
301 #endif
302 
303   PetscFunctionBegin;
304   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
305   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
306 
307   idx   = a->j;
308   v     = a->a;
309   ii    = a->i;
310 
311   for ( i=0; i<mbs; i++ ) {
312     n  = ii[1] - ii[0]; ii++;
313     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
314     for ( j=0; j<n; j++ ) {
315       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
316       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
317       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
318       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
319       v += 9;
320     }
321     z[0] = sum1; z[1] = sum2; z[2] = sum3;
322     z += 3;
323   }
324   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
325   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
326   PLogFlops(18*a->nz - a->m);
327   PetscFunctionReturn(0);
328 }
329 
330 #undef __FUNC__
331 #define __FUNC__ "MatMult_SeqBAIJ_4"
332 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
333 {
334   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
335   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
336   MatScalar       *v;
337   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
338 
339   PetscFunctionBegin;
340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
341   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
342 
343   idx   = a->j;
344   v     = a->a;
345   ii    = a->i;
346 
347   for ( i=0; i<mbs; i++ ) {
348     n  = ii[1] - ii[0]; ii++;
349     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
350     for ( j=0; j<n; j++ ) {
351       xb = x + 4*(*idx++);
352       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
353       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
354       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
355       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
356       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
357       v += 16;
358     }
359     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
360     z += 4;
361   }
362   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
363   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
364   PLogFlops(32*a->nz - a->m);
365   PetscFunctionReturn(0);
366 }
367 
368 #undef __FUNC__
369 #define __FUNC__ "MatMult_SeqBAIJ_5"
370 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
371 {
372   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
373   Scalar          sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
374   MatScalar       *v;
375   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
376 
377   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
378   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
379 
380   idx   = a->j;
381   v     = a->a;
382   ii    = a->i;
383 
384   for ( i=0; i<mbs; i++ ) {
385     n  = ii[1] - ii[0]; ii++;
386     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
387     for ( j=0; j<n; j++ ) {
388       xb = x + 5*(*idx++);
389       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
390       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
391       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
392       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
393       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
394       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
395       v += 25;
396     }
397     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
398     z += 5;
399   }
400   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
401   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
402   PLogFlops(50*a->nz - a->m);
403   PetscFunctionReturn(0);
404 }
405 
406 
407 #undef __FUNC__
408 #define __FUNC__ "MatMult_SeqBAIJ_6"
409 int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
410 {
411   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
412   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
413   Scalar          x1,x2,x3,x4,x5,x6;
414   MatScalar       *v;
415   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
416 
417   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
418   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
419 
420   idx   = a->j;
421   v     = a->a;
422   ii    = a->i;
423 
424   for ( i=0; i<mbs; i++ ) {
425     n  = ii[1] - ii[0]; ii++;
426     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
427     for ( j=0; j<n; j++ ) {
428       xb = x + 6*(*idx++);
429       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
430       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
431       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
432       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
433       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
434       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
435       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
436       v += 36;
437     }
438     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
439     z += 6;
440   }
441 
442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
443   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
444   PLogFlops(72*a->nz - a->m);
445   PetscFunctionReturn(0);
446 }
447 #undef __FUNC__
448 #define __FUNC__ "MatMult_SeqBAIJ_7"
449 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
450 {
451   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
452   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
453   Scalar          x1,x2,x3,x4,x5,x6,x7;
454   MatScalar       *v;
455   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
456 
457   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
458   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
459 
460   idx   = a->j;
461   v     = a->a;
462   ii    = a->i;
463 
464   for ( i=0; i<mbs; i++ ) {
465     n  = ii[1] - ii[0]; ii++;
466     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
467     for ( j=0; j<n; j++ ) {
468       xb = x + 7*(*idx++);
469       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
470       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
471       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
472       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
473       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
474       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
475       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
476       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
477       v += 49;
478     }
479     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
480     z += 7;
481   }
482 
483   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
484   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
485   PLogFlops(98*a->nz - a->m);
486   PetscFunctionReturn(0);
487 }
488 
489 /*
490     This will not work with MatScalar == float because it calls the BLAS
491 */
492 #undef __FUNC__
493 #define __FUNC__ "MatMult_SeqBAIJ_N"
494 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
495 {
496   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
497   Scalar          *x,*z,*xb, *work,*workt;
498   MatScalar       *v;
499   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
500   int             ncols,k;
501 
502   PetscFunctionBegin;
503   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
504   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
505 
506   idx   = a->j;
507   v     = a->a;
508   ii    = a->i;
509 
510 
511   if (!a->mult_work) {
512     k = PetscMax(a->m,a->n);
513     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
514   }
515   work = a->mult_work;
516   for ( i=0; i<mbs; i++ ) {
517     n     = ii[1] - ii[0]; ii++;
518     ncols = n*bs;
519     workt = work;
520     for ( j=0; j<n; j++ ) {
521       xb = x + bs*(*idx++);
522       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
523       workt += bs;
524     }
525     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
526     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
527     v += n*bs2;
528     z += bs;
529   }
530   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
531   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
532   PLogFlops(2*a->nz*bs2 - a->m);
533   PetscFunctionReturn(0);
534 }
535 
536 #undef __FUNC__
537 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
538 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
539 {
540   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
541   Scalar          *x,*y,*z,sum;
542   MatScalar       *v;
543   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
544 
545   PetscFunctionBegin;
546   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
547   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
548   if (zz != yy) {
549     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
550   } else {
551     z = y;
552   }
553 
554   idx   = a->j;
555   v     = a->a;
556   ii    = a->i;
557 
558   for ( i=0; i<mbs; i++ ) {
559     n    = ii[1] - ii[0]; ii++;
560     sum  = y[i];
561     while (n--) sum += *v++ * x[*idx++];
562     z[i] = sum;
563   }
564   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
565   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
566   if (zz != yy) {
567     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
568   }
569   PLogFlops(2*a->nz);
570   PetscFunctionReturn(0);
571 }
572 
573 #undef __FUNC__
574 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
575 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
576 {
577   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
578   Scalar          *x,*y,*z,*xb,sum1,sum2;
579   Scalar          x1,x2;
580   MatScalar       *v;
581   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
582 
583   PetscFunctionBegin;
584   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
585   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
586   if (zz != yy) {
587     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
588   } else {
589     z = y;
590   }
591 
592   idx   = a->j;
593   v     = a->a;
594   ii    = a->i;
595 
596   for ( i=0; i<mbs; i++ ) {
597     n  = ii[1] - ii[0]; ii++;
598     sum1 = y[0]; sum2 = y[1];
599     for ( j=0; j<n; j++ ) {
600       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
601       sum1 += v[0]*x1 + v[2]*x2;
602       sum2 += v[1]*x1 + v[3]*x2;
603       v += 4;
604     }
605     z[0] = sum1; z[1] = sum2;
606     z += 2; y += 2;
607   }
608   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
609   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
610   if (zz != yy) {
611     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
612   }
613   PLogFlops(4*a->nz);
614   PetscFunctionReturn(0);
615 }
616 
617 #undef __FUNC__
618 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
619 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
620 {
621   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
622   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
623   MatScalar       *v;
624   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
625 
626   PetscFunctionBegin;
627   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
628   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
629   if (zz != yy) {
630     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
631   } else {
632     z = y;
633   }
634 
635   idx   = a->j;
636   v     = a->a;
637   ii    = a->i;
638 
639   for ( i=0; i<mbs; i++ ) {
640     n  = ii[1] - ii[0]; ii++;
641     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
642     for ( j=0; j<n; j++ ) {
643       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
644       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
645       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
646       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
647       v += 9;
648     }
649     z[0] = sum1; z[1] = sum2; z[2] = sum3;
650     z += 3; y += 3;
651   }
652   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
653   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
654   if (zz != yy) {
655     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
656   }
657   PLogFlops(18*a->nz);
658   PetscFunctionReturn(0);
659 }
660 
661 #undef __FUNC__
662 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
663 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
664 {
665   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
666   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
667   MatScalar       *v;
668   int             ierr,mbs=a->mbs,i,*idx,*ii;
669   int             j,n;
670 
671   PetscFunctionBegin;
672   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
673   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
674   if (zz != yy) {
675     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
676   } else {
677     z = y;
678   }
679 
680   idx   = a->j;
681   v     = a->a;
682   ii    = a->i;
683 
684   for ( i=0; i<mbs; i++ ) {
685     n  = ii[1] - ii[0]; ii++;
686     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
687     for ( j=0; j<n; j++ ) {
688       xb = x + 4*(*idx++);
689       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
690       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
691       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
692       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
693       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
694       v += 16;
695     }
696     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
697     z += 4; y += 4;
698   }
699   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
700   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
701   if (zz != yy) {
702     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
703   }
704   PLogFlops(32*a->nz);
705   PetscFunctionReturn(0);
706 }
707 
708 #undef __FUNC__
709 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
710 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
711 {
712   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
713   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
714   MatScalar       *v;
715   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
716 
717   PetscFunctionBegin;
718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
720   if (zz != yy) {
721     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
722   } else {
723     z = y;
724   }
725 
726   idx   = a->j;
727   v     = a->a;
728   ii    = a->i;
729 
730   for ( i=0; i<mbs; i++ ) {
731     n  = ii[1] - ii[0]; ii++;
732     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
733     for ( j=0; j<n; j++ ) {
734       xb = x + 5*(*idx++);
735       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
736       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
737       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
738       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
739       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
740       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
741       v += 25;
742     }
743     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
744     z += 5; y += 5;
745   }
746   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
747   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
748   if (zz != yy) {
749     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
750   }
751   PLogFlops(50*a->nz);
752   PetscFunctionReturn(0);
753 }
754 #undef __FUNC__
755 #define __FUNC__ "MatMultAdd_SeqBAIJ_6"
756 int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
757 {
758   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
759   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
760   Scalar          x1,x2,x3,x4,x5,x6;
761   MatScalar       *v;
762   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
763 
764   PetscFunctionBegin;
765   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
766   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
767   if (zz != yy) {
768     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
769   } else {
770     z = y;
771   }
772 
773   idx   = a->j;
774   v     = a->a;
775   ii    = a->i;
776 
777   for ( i=0; i<mbs; i++ ) {
778     n  = ii[1] - ii[0]; ii++;
779     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
780     for ( j=0; j<n; j++ ) {
781       xb = x + 6*(*idx++);
782       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
783       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
784       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
785       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
786       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
787       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
788       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
789       v += 36;
790     }
791     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
792     z += 6; y += 6;
793   }
794   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
795   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
796   if (zz != yy) {
797     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
798   }
799   PLogFlops(72*a->nz);
800   PetscFunctionReturn(0);
801 }
802 
803 #undef __FUNC__
804 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
805 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
806 {
807   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
808   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
809   Scalar          x1,x2,x3,x4,x5,x6,x7;
810   MatScalar       *v;
811   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
812 
813   PetscFunctionBegin;
814   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
815   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
816   if (zz != yy) {
817     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
818   } else {
819     z = y;
820   }
821 
822   idx   = a->j;
823   v     = a->a;
824   ii    = a->i;
825 
826   for ( i=0; i<mbs; i++ ) {
827     n  = ii[1] - ii[0]; ii++;
828     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
829     for ( j=0; j<n; j++ ) {
830       xb = x + 7*(*idx++);
831       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
832       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
833       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
834       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
835       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
836       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
837       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
838       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
839       v += 49;
840     }
841     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842     z += 7; y += 7;
843   }
844   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
845   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
846   if (zz != yy) {
847     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
848   }
849   PLogFlops(98*a->nz);
850   PetscFunctionReturn(0);
851 }
852 
853 #undef __FUNC__
854 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
855 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
856 {
857   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
858   Scalar         *x,*z,*xb,*work,*workt;
859   MatScalar      *v;
860   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
861   int            ncols,k;
862 
863   PetscFunctionBegin;
864   if ( xx != yy) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
865 
866   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
867   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
868 
869   idx   = a->j;
870   v     = a->a;
871   ii    = a->i;
872 
873 
874   if (!a->mult_work) {
875     k = PetscMax(a->m,a->n);
876     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
877   }
878   work = a->mult_work;
879   for ( i=0; i<mbs; i++ ) {
880     n     = ii[1] - ii[0]; ii++;
881     ncols = n*bs;
882     workt = work;
883     for ( j=0; j<n; j++ ) {
884       xb = x + bs*(*idx++);
885       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
886       workt += bs;
887     }
888     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
889     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
890     v += n*bs2;
891     z += bs;
892   }
893   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
894   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
895   PLogFlops(2*a->nz*bs2 );
896   PetscFunctionReturn(0);
897 }
898 
899 #undef __FUNC__
900 #define __FUNC__ "MatMultTrans_SeqBAIJ"
901 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
902 {
903   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
904   Scalar          *xg,*zg,*zb;
905   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
906   MatScalar       *v;
907   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
908   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
909 
910 
911   PetscFunctionBegin;
912   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
913   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
914   ierr = PetscMemzero(z,N*sizeof(Scalar));CHKERRQ(ierr);
915 
916   idx   = a->j;
917   v     = a->a;
918   ii    = a->i;
919 
920   switch (bs) {
921   case 1:
922     for ( i=0; i<mbs; i++ ) {
923       n  = ii[1] - ii[0]; ii++;
924       xb = x + i; x1 = xb[0];
925       ib = idx + ai[i];
926       for ( j=0; j<n; j++ ) {
927         rval    = ib[j];
928         z[rval] += *v++ * x1;
929       }
930     }
931     break;
932   case 2:
933     for ( i=0; i<mbs; i++ ) {
934       n  = ii[1] - ii[0]; ii++;
935       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
936       ib = idx + ai[i];
937       for ( j=0; j<n; j++ ) {
938         rval      = ib[j]*2;
939         z[rval++] += v[0]*x1 + v[1]*x2;
940         z[rval++] += v[2]*x1 + v[3]*x2;
941         v += 4;
942       }
943     }
944     break;
945   case 3:
946     for ( i=0; i<mbs; i++ ) {
947       n  = ii[1] - ii[0]; ii++;
948       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
949       ib = idx + ai[i];
950       for ( j=0; j<n; j++ ) {
951         rval      = ib[j]*3;
952         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
953         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
954         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
955         v += 9;
956       }
957     }
958     break;
959   case 4:
960     for ( i=0; i<mbs; i++ ) {
961       n  = ii[1] - ii[0]; ii++;
962       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
963       ib = idx + ai[i];
964       for ( j=0; j<n; j++ ) {
965         rval      = ib[j]*4;
966         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
967         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
968         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
969         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
970         v += 16;
971       }
972     }
973     break;
974   case 5:
975     for ( i=0; i<mbs; i++ ) {
976       n  = ii[1] - ii[0]; ii++;
977       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
978       x4 = xb[3];   x5 = xb[4];
979       ib = idx + ai[i];
980       for ( j=0; j<n; j++ ) {
981         rval      = ib[j]*5;
982         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
983         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
984         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
985         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
986         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
987         v += 25;
988       }
989     }
990     break;
991       /* block sizes larger then 5 by 5 are handled by BLAS */
992     default: {
993       int       ncols,k;
994       Scalar    *work,*workt;
995 
996       if (!a->mult_work) {
997         k = PetscMax(a->m,a->n);
998         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
999       }
1000       work = a->mult_work;
1001       for ( i=0; i<mbs; i++ ) {
1002         n     = ii[1] - ii[0]; ii++;
1003         ncols = n*bs;
1004         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1005         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1006         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1007         v += n*bs2;
1008         x += bs;
1009         workt = work;
1010         for ( j=0; j<n; j++ ) {
1011           zb = z + bs*(*idx++);
1012           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1013           workt += bs;
1014         }
1015       }
1016     }
1017   }
1018   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1019   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1020   PLogFlops(2*a->nz*a->bs2 - a->n);
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 #undef __FUNC__
1025 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1026 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1027 
1028 {
1029   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1030   Scalar          *xg,*zg,*zb;
1031   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
1032   MatScalar       *v;
1033   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1034   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1035 
1036   PetscFunctionBegin;
1037   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1038   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1039 
1040   if ( yy != zz ) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1041   else {
1042     ierr = PetscMemzero(z,N*sizeof(Scalar));CHKERRQ(ierr);
1043   }
1044 
1045   idx   = a->j;
1046   v     = a->a;
1047   ii    = a->i;
1048 
1049   switch (bs) {
1050   case 1:
1051     for ( i=0; i<mbs; i++ ) {
1052       n  = ii[1] - ii[0]; ii++;
1053       xb = x + i; x1 = xb[0];
1054       ib = idx + ai[i];
1055       for ( j=0; j<n; j++ ) {
1056         rval    = ib[j];
1057         z[rval] += *v++ * x1;
1058       }
1059     }
1060     break;
1061   case 2:
1062     for ( i=0; i<mbs; i++ ) {
1063       n  = ii[1] - ii[0]; ii++;
1064       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1065       ib = idx + ai[i];
1066       for ( j=0; j<n; j++ ) {
1067         rval      = ib[j]*2;
1068         z[rval++] += v[0]*x1 + v[1]*x2;
1069         z[rval++] += v[2]*x1 + v[3]*x2;
1070         v += 4;
1071       }
1072     }
1073     break;
1074   case 3:
1075     for ( i=0; i<mbs; i++ ) {
1076       n  = ii[1] - ii[0]; ii++;
1077       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1078       ib = idx + ai[i];
1079       for ( j=0; j<n; j++ ) {
1080         rval      = ib[j]*3;
1081         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1082         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1083         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1084         v += 9;
1085       }
1086     }
1087     break;
1088   case 4:
1089     for ( i=0; i<mbs; i++ ) {
1090       n  = ii[1] - ii[0]; ii++;
1091       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1092       ib = idx + ai[i];
1093       for ( j=0; j<n; j++ ) {
1094         rval      = ib[j]*4;
1095         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1096         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1097         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1098         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1099         v += 16;
1100       }
1101     }
1102     break;
1103   case 5:
1104     for ( i=0; i<mbs; i++ ) {
1105       n  = ii[1] - ii[0]; ii++;
1106       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1107       x4 = xb[3];   x5 = xb[4];
1108       ib = idx + ai[i];
1109       for ( j=0; j<n; j++ ) {
1110         rval      = ib[j]*5;
1111         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1112         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1113         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1114         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1115         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1116         v += 25;
1117       }
1118     }
1119     break;
1120       /* block sizes larger then 5 by 5 are handled by BLAS */
1121     default: {
1122       int       ncols,k;
1123       Scalar    *work,*workt;
1124 
1125       if (!a->mult_work) {
1126         k = PetscMax(a->m,a->n);
1127         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1128       }
1129       work = a->mult_work;
1130       for ( i=0; i<mbs; i++ ) {
1131         n     = ii[1] - ii[0]; ii++;
1132         ncols = n*bs;
1133         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1134         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1135         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1136         v += n*bs2;
1137         x += bs;
1138         workt = work;
1139         for ( j=0; j<n; j++ ) {
1140           zb = z + bs*(*idx++);
1141           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1142           workt += bs;
1143         }
1144       }
1145     }
1146   }
1147   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1148   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1149   PLogFlops(2*a->nz*a->bs2);
1150   PetscFunctionReturn(0);
1151 }
1152 
1153 #undef __FUNC__
1154 #define __FUNC__ "MatScale_SeqBAIJ"
1155 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1156 {
1157   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1158   int         one = 1, totalnz = a->bs2*a->nz;
1159 
1160   PetscFunctionBegin;
1161   BLscal_( &totalnz, alpha, a->a, &one );
1162   PLogFlops(totalnz);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 #undef __FUNC__
1167 #define __FUNC__ "MatNorm_SeqBAIJ"
1168 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1169 {
1170   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1171   MatScalar   *v = a->a;
1172   double      sum = 0.0;
1173   int         i,j,k,bs = a->bs, nz=a->nz,bs2=a->bs2;
1174 
1175   PetscFunctionBegin;
1176   if (type == NORM_FROBENIUS) {
1177     for (i=0; i< bs2*nz; i++ ) {
1178 #if defined(PETSC_USE_COMPLEX)
1179       sum += PetscReal(PetscConj(*v)*(*v)); v++;
1180 #else
1181       sum += (*v)*(*v); v++;
1182 #endif
1183     }
1184     *norm = sqrt(sum);
1185   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1186     *norm = 0.0;
1187     for ( k=0; k<bs; k++ ) {
1188       for ( j=0; j<a->m; j++ ) {
1189         v = a->a + bs2*a->i[j] + k;
1190         sum = 0.0;
1191         for ( i=0; i<a->i[j+1]-a->i[j]; i++ ) {
1192           sum += PetscAbsScalar(*v);
1193           v   += bs;
1194         }
1195         if (sum > *norm) *norm = sum;
1196       }
1197     }
1198   } else {
1199     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1200   }
1201   PetscFunctionReturn(0);
1202 }
1203 
1204 
1205 #undef __FUNC__
1206 #define __FUNC__ "MatEqual_SeqBAIJ"
1207 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1208 {
1209   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1210 
1211   PetscFunctionBegin;
1212   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1213 
1214   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1215   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1216     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1217   }
1218 
1219   /* if the a->i are the same */
1220   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1221     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1222   }
1223 
1224   /* if a->j are the same */
1225   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1226     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1227   }
1228 
1229   /* if a->a are the same */
1230   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1231     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1232   }
1233   *flg = PETSC_TRUE;
1234   PetscFunctionReturn(0);
1235 
1236 }
1237 
1238 #undef __FUNC__
1239 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1240 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1241 {
1242   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1243   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1244   Scalar      *x, zero = 0.0;
1245   MatScalar   *aa,*aa_j;
1246 
1247   PetscFunctionBegin;
1248   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1249   bs   = a->bs;
1250   aa   = a->a;
1251   ai   = a->i;
1252   aj   = a->j;
1253   ambs = a->mbs;
1254   bs2  = a->bs2;
1255 
1256   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1257   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1258   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1259   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1260   for ( i=0; i<ambs; i++ ) {
1261     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1262       if (aj[j] == i) {
1263         row  = i*bs;
1264         aa_j = aa+j*bs2;
1265         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1266         break;
1267       }
1268     }
1269   }
1270   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1271   PetscFunctionReturn(0);
1272 }
1273 
1274 #undef __FUNC__
1275 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1276 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1277 {
1278   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1279   Scalar      *l,*r,x,*li,*ri;
1280   MatScalar   *aa,*v;
1281   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1282 
1283   PetscFunctionBegin;
1284   ai  = a->i;
1285   aj  = a->j;
1286   aa  = a->a;
1287   m   = a->m;
1288   n   = a->n;
1289   bs  = a->bs;
1290   mbs = a->mbs;
1291   bs2 = a->bs2;
1292   if (ll) {
1293     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1294     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1295     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1296     for ( i=0; i<mbs; i++ ) { /* for each block row */
1297       M  = ai[i+1] - ai[i];
1298       li = l + i*bs;
1299       v  = aa + bs2*ai[i];
1300       for ( j=0; j<M; j++ ) { /* for each block */
1301         for ( k=0; k<bs2; k++ ) {
1302           (*v++) *= li[k%bs];
1303         }
1304       }
1305     }
1306     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1307     PLogFlops(a->nz);
1308   }
1309 
1310   if (rr) {
1311     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1312     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1313     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1314     for ( i=0; i<mbs; i++ ) { /* for each block row */
1315       M  = ai[i+1] - ai[i];
1316       v  = aa + bs2*ai[i];
1317       for ( j=0; j<M; j++ ) { /* for each block */
1318         ri = r + bs*aj[ai[i]+j];
1319         for ( k=0; k<bs; k++ ) {
1320           x = ri[k];
1321           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1322         }
1323       }
1324     }
1325     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1326     PLogFlops(a->nz);
1327   }
1328   PetscFunctionReturn(0);
1329 }
1330 
1331 
1332 #undef __FUNC__
1333 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1334 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1335 {
1336   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1337 
1338   PetscFunctionBegin;
1339   info->rows_global    = (double)a->m;
1340   info->columns_global = (double)a->n;
1341   info->rows_local     = (double)a->m;
1342   info->columns_local  = (double)a->n;
1343   info->block_size     = a->bs2;
1344   info->nz_allocated   = a->maxnz;
1345   info->nz_used        = a->bs2*a->nz;
1346   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1347   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1348     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1349   info->assemblies   = A->num_ass;
1350   info->mallocs      = a->reallocs;
1351   info->memory       = A->mem;
1352   if (A->factor) {
1353     info->fill_ratio_given  = A->info.fill_ratio_given;
1354     info->fill_ratio_needed = A->info.fill_ratio_needed;
1355     info->factor_mallocs    = A->info.factor_mallocs;
1356   } else {
1357     info->fill_ratio_given  = 0;
1358     info->fill_ratio_needed = 0;
1359     info->factor_mallocs    = 0;
1360   }
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 
1365 #undef __FUNC__
1366 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1367 int MatZeroEntries_SeqBAIJ(Mat A)
1368 {
1369   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1370   int         ierr;
1371 
1372   PetscFunctionBegin;
1373   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1374   PetscFunctionReturn(0);
1375 }
1376