xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision a3b388050d4e8a4c38128e174a7572fa24ee1ba5)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.47 1999/06/08 22:56:09 balay Exp balay $";
3 #endif
4 
5 #include "sys.h"
6 #include "src/mat/impls/baij/seq/baij.h"
7 #include "src/vec/vecimpl.h"
8 #include "src/inline/spops.h"
9 #include "src/inline/ilu.h"
10 #include "bitarray.h"
11 
12 #undef __FUNC__
13 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
14 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
15 {
16   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
17   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
18   int         start, end, *ai, *aj,bs,*nidx2;
19   BT          table;
20 
21   PetscFunctionBegin;
22   m     = a->mbs;
23   ai    = a->i;
24   aj    = a->j;
25   bs    = a->bs;
26 
27   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
28 
29   ierr  = BTCreate(m,table);CHKERRQ(ierr);
30   nidx  = (int *) PetscMalloc((m+1)*sizeof(int));CHKPTRQ(nidx);
31   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int));CHKPTRQ(nidx2);
32 
33   for ( i=0; i<is_max; i++ ) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     BTMemzero(m,table);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for ( j=0; j<n ; ++j){
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
46       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for ( j=0; j<ov; j++){ /* for each overlap*/
53       n = isz;
54       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for ( l = start; l<end ; l++){
59           val = aj[l];
60           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++ ) {
66       for (k=0; k<bs; k++ )
67         nidx2[j*bs+k] = nidx[j]*bs+k;
68     }
69     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i));CHKERRQ(ierr);
70   }
71   BTDestroy(table);
72   ierr = PetscFree(nidx);CHKERRQ(ierr);
73   ierr = PetscFree(nidx2);CHKERRQ(ierr);
74   PetscFunctionReturn(0);
75 }
76 
77 #undef __FUNC__
78 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
79 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
80 {
81   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
82   int          *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
83   int          row,mat_i,*mat_j,tcol,*mat_ilen;
84   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
85   int          *aj = a->j, *ai = a->i;
86   MatScalar    *mat_a;
87   Mat          C;
88 
89   PetscFunctionBegin;
90   ierr = ISSorted(iscol,(PetscTruth*)&i);CHKERRQ(ierr);
91   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
92 
93   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
94   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
95   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
96   ierr = ISGetSize(iscol,&ncols);CHKERRQ(ierr);
97 
98   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int));CHKPTRQ(smap);
99   ssmap = smap;
100   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int));CHKPTRQ(lens);
101   ierr  = PetscMemzero(smap,oldcols*sizeof(int));CHKERRQ(ierr);
102   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
103   /* determine lens of each row */
104   for (i=0; i<nrows; i++) {
105     kstart  = ai[irow[i]];
106     kend    = kstart + a->ilen[irow[i]];
107     lens[i] = 0;
108       for ( k=kstart; k<kend; k++ ) {
109         if (ssmap[aj[k]]) {
110           lens[i]++;
111         }
112       }
113     }
114   /* Create and fill new matrix */
115   if (scall == MAT_REUSE_MATRIX) {
116     c = (Mat_SeqBAIJ *)((*B)->data);
117 
118     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
119     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
120       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
121     }
122     ierr = PetscMemzero(c->ilen,c->mbs*sizeof(int));CHKERRQ(ierr);
123     C = *B;
124   } else {
125     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
126   }
127   c = (Mat_SeqBAIJ *)(C->data);
128   for (i=0; i<nrows; i++) {
129     row    = irow[i];
130     kstart = ai[row];
131     kend   = kstart + a->ilen[row];
132     mat_i  = c->i[i];
133     mat_j  = c->j + mat_i;
134     mat_a  = c->a + mat_i*bs2;
135     mat_ilen = c->ilen + i;
136     for ( k=kstart; k<kend; k++ ) {
137       if ((tcol=ssmap[a->j[k]])) {
138         *mat_j++ = tcol - 1;
139         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr);
140         mat_a   += bs2;
141         (*mat_ilen)++;
142       }
143     }
144   }
145 
146   /* Free work space */
147   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
148   ierr = PetscFree(smap);CHKERRQ(ierr);
149   ierr = PetscFree(lens);CHKERRQ(ierr);
150   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
151   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
152 
153   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
154   *B = C;
155   PetscFunctionReturn(0);
156 }
157 
158 #undef __FUNC__
159 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
160 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
161 {
162   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
163   IS          is1,is2;
164   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
165 
166   PetscFunctionBegin;
167   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
168   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
169   ierr = ISGetSize(isrow,&nrows);CHKERRQ(ierr);
170   ierr = ISGetSize(iscol,&ncols);CHKERRQ(ierr);
171 
172   /* Verify if the indices corespond to each element in a block
173    and form the IS with compressed IS */
174   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int));CHKPTRQ(vary);
175   iary = vary + a->mbs;
176   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
177   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
178   count = 0;
179   for (i=0; i<a->mbs; i++) {
180     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"Index set does not match blocks");
181     if (vary[i]==bs) iary[count++] = i;
182   }
183   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1);CHKERRQ(ierr);
184 
185   ierr = PetscMemzero(vary,(a->mbs)*sizeof(int));CHKERRQ(ierr);
186   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
187   count = 0;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
190     if (vary[i]==bs) iary[count++] = i;
191   }
192   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2);CHKERRQ(ierr);
193   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
194   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
195   ierr = PetscFree(vary);CHKERRQ(ierr);
196 
197   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);CHKERRQ(ierr);
198   ISDestroy(is1);
199   ISDestroy(is2);
200   PetscFunctionReturn(0);
201 }
202 
203 #undef __FUNC__
204 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
205 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatReuse scall,Mat **B)
206 {
207   int ierr,i;
208 
209   PetscFunctionBegin;
210   if (scall == MAT_INITIAL_MATRIX) {
211     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) );CHKPTRQ(*B);
212   }
213 
214   for ( i=0; i<n; i++ ) {
215     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);CHKERRQ(ierr);
216   }
217   PetscFunctionReturn(0);
218 }
219 
220 
221 /* -------------------------------------------------------*/
222 /* Should check that shapes of vectors and matrices match */
223 /* -------------------------------------------------------*/
224 #include "pinclude/blaslapack.h"
225 
226 #undef __FUNC__
227 #define __FUNC__ "MatMult_SeqBAIJ_1"
228 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
229 {
230   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
231   Scalar          *x,*z,sum;
232   MatScalar       *v;
233   int             mbs=a->mbs,i,*idx,*ii,n,ierr;
234 
235   PetscFunctionBegin;
236   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
237   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
238 
239   idx   = a->j;
240   v     = a->a;
241   ii    = a->i;
242 
243   for ( i=0; i<mbs; i++ ) {
244     n    = ii[1] - ii[0]; ii++;
245     sum  = 0.0;
246     while (n--) sum += *v++ * x[*idx++];
247     z[i] = sum;
248   }
249   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
250   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
251   PLogFlops(2*a->nz - a->m);
252   PetscFunctionReturn(0);
253 }
254 
255 #undef __FUNC__
256 #define __FUNC__ "MatMult_SeqBAIJ_2"
257 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
258 {
259   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
260   Scalar          *x,*z,*xb,sum1,sum2;
261   Scalar          x1,x2;
262   MatScalar       *v;
263   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
264 
265   PetscFunctionBegin;
266   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
267   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
268 
269   idx   = a->j;
270   v     = a->a;
271   ii    = a->i;
272 
273   for ( i=0; i<mbs; i++ ) {
274     n  = ii[1] - ii[0]; ii++;
275     sum1 = 0.0; sum2 = 0.0;
276     for ( j=0; j<n; j++ ) {
277       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
278       sum1 += v[0]*x1 + v[2]*x2;
279       sum2 += v[1]*x1 + v[3]*x2;
280       v += 4;
281     }
282     z[0] = sum1; z[1] = sum2;
283     z += 2;
284   }
285   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
286   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
287   PLogFlops(8*a->nz - a->m);
288   PetscFunctionReturn(0);
289 }
290 
291 #undef __FUNC__
292 #define __FUNC__ "MatMult_SeqBAIJ_3"
293 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
294 {
295   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data;
296   Scalar       *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
297   MatScalar    *v;
298   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;
299 
300 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
301 #pragma disjoint(*v,*z,*xb)
302 #endif
303 
304   PetscFunctionBegin;
305   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
306   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
307 
308   idx   = a->j;
309   v     = a->a;
310   ii    = a->i;
311 
312   for ( i=0; i<mbs; i++ ) {
313     n  = ii[1] - ii[0]; ii++;
314     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
315     for ( j=0; j<n; j++ ) {
316       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
317       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
318       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
319       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
320       v += 9;
321     }
322     z[0] = sum1; z[1] = sum2; z[2] = sum3;
323     z += 3;
324   }
325   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
326   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
327   PLogFlops(18*a->nz - a->m);
328   PetscFunctionReturn(0);
329 }
330 
331 #undef __FUNC__
332 #define __FUNC__ "MatMult_SeqBAIJ_4"
333 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
334 {
335   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
336   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
337   MatScalar       *v;
338   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
339 
340   PetscFunctionBegin;
341   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
342   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
343 
344   idx   = a->j;
345   v     = a->a;
346   ii    = a->i;
347 
348   for ( i=0; i<mbs; i++ ) {
349     n  = ii[1] - ii[0]; ii++;
350     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
351     for ( j=0; j<n; j++ ) {
352       xb = x + 4*(*idx++);
353       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
354       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
355       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
356       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
357       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
358       v += 16;
359     }
360     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
361     z += 4;
362   }
363   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
364   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
365   PLogFlops(32*a->nz - a->m);
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNC__
370 #define __FUNC__ "MatMult_SeqBAIJ_5"
371 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
372 {
373   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
374   Scalar          sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
375   MatScalar       *v;
376   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
377 
378   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
379   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
380 
381   idx   = a->j;
382   v     = a->a;
383   ii    = a->i;
384 
385   for ( i=0; i<mbs; i++ ) {
386     n  = ii[1] - ii[0]; ii++;
387     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
388     for ( j=0; j<n; j++ ) {
389       xb = x + 5*(*idx++);
390       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
391       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
392       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
393       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
394       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
395       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
396       v += 25;
397     }
398     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
399     z += 5;
400   }
401   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
402   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
403   PLogFlops(50*a->nz - a->m);
404   PetscFunctionReturn(0);
405 }
406 
407 
408 #undef __FUNC__
409 #define __FUNC__ "MatMult_SeqBAIJ_6"
410 int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
411 {
412   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
413   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
414   Scalar          x1,x2,x3,x4,x5,x6;
415   MatScalar       *v;
416   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
417 
418   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
419   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
420 
421   idx   = a->j;
422   v     = a->a;
423   ii    = a->i;
424 
425   for ( i=0; i<mbs; i++ ) {
426     n  = ii[1] - ii[0]; ii++;
427     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
428     for ( j=0; j<n; j++ ) {
429       xb = x + 6*(*idx++);
430       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
431       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
432       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
433       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
434       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
435       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
436       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
437       v += 36;
438     }
439     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
440     z += 6;
441   }
442 
443   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
444   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
445   PLogFlops(72*a->nz - a->m);
446   PetscFunctionReturn(0);
447 }
448 #undef __FUNC__
449 #define __FUNC__ "MatMult_SeqBAIJ_7"
450 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
451 {
452   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
453   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
454   Scalar          x1,x2,x3,x4,x5,x6,x7;
455   MatScalar       *v;
456   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
457 
458   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
459   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
460 
461   idx   = a->j;
462   v     = a->a;
463   ii    = a->i;
464 
465   for ( i=0; i<mbs; i++ ) {
466     n  = ii[1] - ii[0]; ii++;
467     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
468     for ( j=0; j<n; j++ ) {
469       xb = x + 7*(*idx++);
470       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
471       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
472       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
473       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
474       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
475       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
476       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
477       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
478       v += 49;
479     }
480     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
481     z += 7;
482   }
483 
484   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
485   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
486   PLogFlops(98*a->nz - a->m);
487   PetscFunctionReturn(0);
488 }
489 
490 /*
491     This will not work with MatScalar == float because it calls the BLAS
492 */
493 #undef __FUNC__
494 #define __FUNC__ "MatMult_SeqBAIJ_N"
495 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
496 {
497   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
498   Scalar          *x,*z,*xb, *work,*workt;
499   MatScalar       *v;
500   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
501   int             ncols,k;
502 
503   PetscFunctionBegin;
504   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
505   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
506 
507   idx   = a->j;
508   v     = a->a;
509   ii    = a->i;
510 
511 
512   if (!a->mult_work) {
513     k = PetscMax(a->m,a->n);
514     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
515   }
516   work = a->mult_work;
517   for ( i=0; i<mbs; i++ ) {
518     n     = ii[1] - ii[0]; ii++;
519     ncols = n*bs;
520     workt = work;
521     for ( j=0; j<n; j++ ) {
522       xb = x + bs*(*idx++);
523       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
524       workt += bs;
525     }
526     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
527     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
528     v += n*bs2;
529     z += bs;
530   }
531   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
532   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
533   PLogFlops(2*a->nz*bs2 - a->m);
534   PetscFunctionReturn(0);
535 }
536 
537 #undef __FUNC__
538 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
539 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
540 {
541   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
542   Scalar          *x,*y,*z,sum;
543   MatScalar       *v;
544   int             ierr,mbs=a->mbs,i,*idx,*ii,n;
545 
546   PetscFunctionBegin;
547   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
548   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
549   if (zz != yy) {
550     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
551   } else {
552     z = y;
553   }
554 
555   idx   = a->j;
556   v     = a->a;
557   ii    = a->i;
558 
559   for ( i=0; i<mbs; i++ ) {
560     n    = ii[1] - ii[0]; ii++;
561     sum  = y[i];
562     while (n--) sum += *v++ * x[*idx++];
563     z[i] = sum;
564   }
565   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
566   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
567   if (zz != yy) {
568     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
569   }
570   PLogFlops(2*a->nz);
571   PetscFunctionReturn(0);
572 }
573 
574 #undef __FUNC__
575 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
576 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
577 {
578   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
579   Scalar          *x,*y,*z,*xb,sum1,sum2;
580   Scalar          x1,x2;
581   MatScalar       *v;
582   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
583 
584   PetscFunctionBegin;
585   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
586   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
587   if (zz != yy) {
588     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
589   } else {
590     z = y;
591   }
592 
593   idx   = a->j;
594   v     = a->a;
595   ii    = a->i;
596 
597   for ( i=0; i<mbs; i++ ) {
598     n  = ii[1] - ii[0]; ii++;
599     sum1 = y[0]; sum2 = y[1];
600     for ( j=0; j<n; j++ ) {
601       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
602       sum1 += v[0]*x1 + v[2]*x2;
603       sum2 += v[1]*x1 + v[3]*x2;
604       v += 4;
605     }
606     z[0] = sum1; z[1] = sum2;
607     z += 2; y += 2;
608   }
609   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
610   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
611   if (zz != yy) {
612     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
613   }
614   PLogFlops(4*a->nz);
615   PetscFunctionReturn(0);
616 }
617 
618 #undef __FUNC__
619 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
620 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
621 {
622   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
623   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
624   MatScalar       *v;
625   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
626 
627   PetscFunctionBegin;
628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
630   if (zz != yy) {
631     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
632   } else {
633     z = y;
634   }
635 
636   idx   = a->j;
637   v     = a->a;
638   ii    = a->i;
639 
640   for ( i=0; i<mbs; i++ ) {
641     n  = ii[1] - ii[0]; ii++;
642     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
643     for ( j=0; j<n; j++ ) {
644       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
645       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
646       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
647       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
648       v += 9;
649     }
650     z[0] = sum1; z[1] = sum2; z[2] = sum3;
651     z += 3; y += 3;
652   }
653   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
654   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
655   if (zz != yy) {
656     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
657   }
658   PLogFlops(18*a->nz);
659   PetscFunctionReturn(0);
660 }
661 
662 #undef __FUNC__
663 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
664 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
665 {
666   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
667   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
668   MatScalar       *v;
669   int             ierr,mbs=a->mbs,i,*idx,*ii;
670   int             j,n;
671 
672   PetscFunctionBegin;
673   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
674   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
675   if (zz != yy) {
676     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
677   } else {
678     z = y;
679   }
680 
681   idx   = a->j;
682   v     = a->a;
683   ii    = a->i;
684 
685   for ( i=0; i<mbs; i++ ) {
686     n  = ii[1] - ii[0]; ii++;
687     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
688     for ( j=0; j<n; j++ ) {
689       xb = x + 4*(*idx++);
690       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
691       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
692       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
693       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
694       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
695       v += 16;
696     }
697     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
698     z += 4; y += 4;
699   }
700   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
701   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
702   if (zz != yy) {
703     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
704   }
705   PLogFlops(32*a->nz);
706   PetscFunctionReturn(0);
707 }
708 
709 #undef __FUNC__
710 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
711 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
712 {
713   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
714   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
715   MatScalar       *v;
716   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
717 
718   PetscFunctionBegin;
719   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
720   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
721   if (zz != yy) {
722     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
723   } else {
724     z = y;
725   }
726 
727   idx   = a->j;
728   v     = a->a;
729   ii    = a->i;
730 
731   for ( i=0; i<mbs; i++ ) {
732     n  = ii[1] - ii[0]; ii++;
733     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
734     for ( j=0; j<n; j++ ) {
735       xb = x + 5*(*idx++);
736       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
737       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
738       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
739       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
740       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
741       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
742       v += 25;
743     }
744     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
745     z += 5; y += 5;
746   }
747   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
748   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
749   if (zz != yy) {
750     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
751   }
752   PLogFlops(50*a->nz);
753   PetscFunctionReturn(0);
754 }
755 #undef __FUNC__
756 #define __FUNC__ "MatMultAdd_SeqBAIJ_6"
757 int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
758 {
759   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
760   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
761   Scalar          x1,x2,x3,x4,x5,x6;
762   MatScalar       *v;
763   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
764 
765   PetscFunctionBegin;
766   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
767   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) {
769     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
770   } else {
771     z = y;
772   }
773 
774   idx   = a->j;
775   v     = a->a;
776   ii    = a->i;
777 
778   for ( i=0; i<mbs; i++ ) {
779     n  = ii[1] - ii[0]; ii++;
780     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
781     for ( j=0; j<n; j++ ) {
782       xb = x + 6*(*idx++);
783       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
784       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
785       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
786       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
787       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
788       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
789       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
790       v += 36;
791     }
792     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
793     z += 6; y += 6;
794   }
795   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
796   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
797   if (zz != yy) {
798     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
799   }
800   PLogFlops(72*a->nz);
801   PetscFunctionReturn(0);
802 }
803 
804 #undef __FUNC__
805 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
806 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
807 {
808   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
809   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
810   Scalar          x1,x2,x3,x4,x5,x6,x7;
811   MatScalar       *v;
812   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;
813 
814   PetscFunctionBegin;
815   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
816   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
817   if (zz != yy) {
818     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
819   } else {
820     z = y;
821   }
822 
823   idx   = a->j;
824   v     = a->a;
825   ii    = a->i;
826 
827   for ( i=0; i<mbs; i++ ) {
828     n  = ii[1] - ii[0]; ii++;
829     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
830     for ( j=0; j<n; j++ ) {
831       xb = x + 7*(*idx++);
832       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
833       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
834       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
835       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
836       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
837       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
838       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
839       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
840       v += 49;
841     }
842     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
843     z += 7; y += 7;
844   }
845   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
846   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
847   if (zz != yy) {
848     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
849   }
850   PLogFlops(98*a->nz);
851   PetscFunctionReturn(0);
852 }
853 
854 #undef __FUNC__
855 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
856 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
857 {
858   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *) A->data;
859   Scalar         *x,*z,*xb,*work,*workt;
860   MatScalar      *v;
861   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
862   int            ncols,k;
863 
864   PetscFunctionBegin;
865   if ( xx != yy) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
866 
867   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
868   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
869 
870   idx   = a->j;
871   v     = a->a;
872   ii    = a->i;
873 
874 
875   if (!a->mult_work) {
876     k = PetscMax(a->m,a->n);
877     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
878   }
879   work = a->mult_work;
880   for ( i=0; i<mbs; i++ ) {
881     n     = ii[1] - ii[0]; ii++;
882     ncols = n*bs;
883     workt = work;
884     for ( j=0; j<n; j++ ) {
885       xb = x + bs*(*idx++);
886       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
887       workt += bs;
888     }
889     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
890     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
891     v += n*bs2;
892     z += bs;
893   }
894   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
895   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
896   PLogFlops(2*a->nz*bs2 );
897   PetscFunctionReturn(0);
898 }
899 
900 #undef __FUNC__
901 #define __FUNC__ "MatMultTrans_SeqBAIJ"
902 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
903 {
904   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
905   Scalar          *xg,*zg,*zb;
906   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
907   MatScalar       *v;
908   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
909   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
910 
911 
912   PetscFunctionBegin;
913   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
914   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
915   ierr = PetscMemzero(z,N*sizeof(Scalar));CHKERRQ(ierr);
916 
917   idx   = a->j;
918   v     = a->a;
919   ii    = a->i;
920 
921   switch (bs) {
922   case 1:
923     for ( i=0; i<mbs; i++ ) {
924       n  = ii[1] - ii[0]; ii++;
925       xb = x + i; x1 = xb[0];
926       ib = idx + ai[i];
927       for ( j=0; j<n; j++ ) {
928         rval    = ib[j];
929         z[rval] += *v++ * x1;
930       }
931     }
932     break;
933   case 2:
934     for ( i=0; i<mbs; i++ ) {
935       n  = ii[1] - ii[0]; ii++;
936       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
937       ib = idx + ai[i];
938       for ( j=0; j<n; j++ ) {
939         rval      = ib[j]*2;
940         z[rval++] += v[0]*x1 + v[1]*x2;
941         z[rval++] += v[2]*x1 + v[3]*x2;
942         v += 4;
943       }
944     }
945     break;
946   case 3:
947     for ( i=0; i<mbs; i++ ) {
948       n  = ii[1] - ii[0]; ii++;
949       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
950       ib = idx + ai[i];
951       for ( j=0; j<n; j++ ) {
952         rval      = ib[j]*3;
953         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
954         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
955         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
956         v += 9;
957       }
958     }
959     break;
960   case 4:
961     for ( i=0; i<mbs; i++ ) {
962       n  = ii[1] - ii[0]; ii++;
963       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
964       ib = idx + ai[i];
965       for ( j=0; j<n; j++ ) {
966         rval      = ib[j]*4;
967         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
968         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
969         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
970         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
971         v += 16;
972       }
973     }
974     break;
975   case 5:
976     for ( i=0; i<mbs; i++ ) {
977       n  = ii[1] - ii[0]; ii++;
978       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
979       x4 = xb[3];   x5 = xb[4];
980       ib = idx + ai[i];
981       for ( j=0; j<n; j++ ) {
982         rval      = ib[j]*5;
983         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
984         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
985         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
986         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
987         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
988         v += 25;
989       }
990     }
991     break;
992       /* block sizes larger then 5 by 5 are handled by BLAS */
993     default: {
994       int       ncols,k;
995       Scalar    *work,*workt;
996 
997       if (!a->mult_work) {
998         k = PetscMax(a->m,a->n);
999         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1000       }
1001       work = a->mult_work;
1002       for ( i=0; i<mbs; i++ ) {
1003         n     = ii[1] - ii[0]; ii++;
1004         ncols = n*bs;
1005         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1006         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1007         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1008         v += n*bs2;
1009         x += bs;
1010         workt = work;
1011         for ( j=0; j<n; j++ ) {
1012           zb = z + bs*(*idx++);
1013           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1014           workt += bs;
1015         }
1016       }
1017     }
1018   }
1019   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1020   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1021   PLogFlops(2*a->nz*a->bs2 - a->n);
1022   PetscFunctionReturn(0);
1023 }
1024 
1025 #undef __FUNC__
1026 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1027 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1028 
1029 {
1030   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1031   Scalar          *xg,*zg,*zb;
1032   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5;
1033   MatScalar       *v;
1034   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1035   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1036 
1037   PetscFunctionBegin;
1038   ierr = VecGetArray(xx,&xg);CHKERRQ(ierr); x = xg;
1039   ierr = VecGetArray(zz,&zg);CHKERRQ(ierr); z = zg;
1040 
1041   if ( yy != zz ) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1042   else {
1043     ierr = PetscMemzero(z,N*sizeof(Scalar));CHKERRQ(ierr);
1044   }
1045 
1046   idx   = a->j;
1047   v     = a->a;
1048   ii    = a->i;
1049 
1050   switch (bs) {
1051   case 1:
1052     for ( i=0; i<mbs; i++ ) {
1053       n  = ii[1] - ii[0]; ii++;
1054       xb = x + i; x1 = xb[0];
1055       ib = idx + ai[i];
1056       for ( j=0; j<n; j++ ) {
1057         rval    = ib[j];
1058         z[rval] += *v++ * x1;
1059       }
1060     }
1061     break;
1062   case 2:
1063     for ( i=0; i<mbs; i++ ) {
1064       n  = ii[1] - ii[0]; ii++;
1065       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1066       ib = idx + ai[i];
1067       for ( j=0; j<n; j++ ) {
1068         rval      = ib[j]*2;
1069         z[rval++] += v[0]*x1 + v[1]*x2;
1070         z[rval++] += v[2]*x1 + v[3]*x2;
1071         v += 4;
1072       }
1073     }
1074     break;
1075   case 3:
1076     for ( i=0; i<mbs; i++ ) {
1077       n  = ii[1] - ii[0]; ii++;
1078       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1079       ib = idx + ai[i];
1080       for ( j=0; j<n; j++ ) {
1081         rval      = ib[j]*3;
1082         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1083         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1084         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1085         v += 9;
1086       }
1087     }
1088     break;
1089   case 4:
1090     for ( i=0; i<mbs; i++ ) {
1091       n  = ii[1] - ii[0]; ii++;
1092       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1093       ib = idx + ai[i];
1094       for ( j=0; j<n; j++ ) {
1095         rval      = ib[j]*4;
1096         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1097         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1098         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1099         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1100         v += 16;
1101       }
1102     }
1103     break;
1104   case 5:
1105     for ( i=0; i<mbs; i++ ) {
1106       n  = ii[1] - ii[0]; ii++;
1107       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1108       x4 = xb[3];   x5 = xb[4];
1109       ib = idx + ai[i];
1110       for ( j=0; j<n; j++ ) {
1111         rval      = ib[j]*5;
1112         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1113         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1114         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1115         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1116         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1117         v += 25;
1118       }
1119     }
1120     break;
1121       /* block sizes larger then 5 by 5 are handled by BLAS */
1122     default: {
1123       int       ncols,k;
1124       Scalar    *work,*workt;
1125 
1126       if (!a->mult_work) {
1127         k = PetscMax(a->m,a->n);
1128         a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1129       }
1130       work = a->mult_work;
1131       for ( i=0; i<mbs; i++ ) {
1132         n     = ii[1] - ii[0]; ii++;
1133         ncols = n*bs;
1134         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));CHKERRQ(ierr);
1135         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1136         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1137         v += n*bs2;
1138         x += bs;
1139         workt = work;
1140         for ( j=0; j<n; j++ ) {
1141           zb = z + bs*(*idx++);
1142           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1143           workt += bs;
1144         }
1145       }
1146     }
1147   }
1148   ierr = VecRestoreArray(xx,&xg);CHKERRQ(ierr);
1149   ierr = VecRestoreArray(zz,&zg);CHKERRQ(ierr);
1150   PLogFlops(2*a->nz*a->bs2);
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 #undef __FUNC__
1155 #define __FUNC__ "MatScale_SeqBAIJ"
1156 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1157 {
1158   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1159   int         one = 1, totalnz = a->bs2*a->nz;
1160 
1161   PetscFunctionBegin;
1162   BLscal_( &totalnz, alpha, a->a, &one );
1163   PLogFlops(totalnz);
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 #undef __FUNC__
1168 #define __FUNC__ "MatNorm_SeqBAIJ"
1169 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1170 {
1171   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1172   MatScalar   *v = a->a;
1173   double      sum = 0.0;
1174   int         i,j,k,bs = a->bs, nz=a->nz,bs2=a->bs2;
1175 
1176   PetscFunctionBegin;
1177   if (type == NORM_FROBENIUS) {
1178     for (i=0; i< bs2*nz; i++ ) {
1179 #if defined(PETSC_USE_COMPLEX)
1180       sum += PetscReal(PetscConj(*v)*(*v)); v++;
1181 #else
1182       sum += (*v)*(*v); v++;
1183 #endif
1184     }
1185     *norm = sqrt(sum);
1186   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1187     *norm = 0.0;
1188     for ( k=0; k<bs; k++ ) {
1189       for ( j=0; j<a->m; j++ ) {
1190         v = a->a + bs2*a->i[j] + k;
1191         sum = 0.0;
1192         for ( i=0; i<a->i[j+1]-a->i[j]; i++ ) {
1193           sum += PetscAbsScalar(*v);
1194           v   += bs;
1195         }
1196         if (sum > *norm) *norm = sum;
1197       }
1198     }
1199   } else {
1200     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1201   }
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 
1206 #undef __FUNC__
1207 #define __FUNC__ "MatEqual_SeqBAIJ"
1208 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1209 {
1210   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1211 
1212   PetscFunctionBegin;
1213   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1214 
1215   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1216   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1217     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1218   }
1219 
1220   /* if the a->i are the same */
1221   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1222     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1223   }
1224 
1225   /* if a->j are the same */
1226   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1227     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1228   }
1229 
1230   /* if a->a are the same */
1231   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1232     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1233   }
1234   *flg = PETSC_TRUE;
1235   PetscFunctionReturn(0);
1236 
1237 }
1238 
1239 #undef __FUNC__
1240 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1241 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1242 {
1243   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1244   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1245   Scalar      *x, zero = 0.0;
1246   MatScalar   *aa,*aa_j;
1247 
1248   PetscFunctionBegin;
1249   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1250   bs   = a->bs;
1251   aa   = a->a;
1252   ai   = a->i;
1253   aj   = a->j;
1254   ambs = a->mbs;
1255   bs2  = a->bs2;
1256 
1257   ierr = VecSet(&zero,v);CHKERRQ(ierr);
1258   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1259   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1260   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1261   for ( i=0; i<ambs; i++ ) {
1262     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1263       if (aj[j] == i) {
1264         row  = i*bs;
1265         aa_j = aa+j*bs2;
1266         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1267         break;
1268       }
1269     }
1270   }
1271   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1272   PetscFunctionReturn(0);
1273 }
1274 
1275 #undef __FUNC__
1276 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1277 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1278 {
1279   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1280   Scalar      *l,*r,x,*li,*ri;
1281   MatScalar   *aa,*v;
1282   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1283 
1284   PetscFunctionBegin;
1285   ai  = a->i;
1286   aj  = a->j;
1287   aa  = a->a;
1288   m   = a->m;
1289   n   = a->n;
1290   bs  = a->bs;
1291   mbs = a->mbs;
1292   bs2 = a->bs2;
1293   if (ll) {
1294     ierr = VecGetArray(ll,&l);CHKERRQ(ierr);
1295     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1296     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1297     for ( i=0; i<mbs; i++ ) { /* for each block row */
1298       M  = ai[i+1] - ai[i];
1299       li = l + i*bs;
1300       v  = aa + bs2*ai[i];
1301       for ( j=0; j<M; j++ ) { /* for each block */
1302         for ( k=0; k<bs2; k++ ) {
1303           (*v++) *= li[k%bs];
1304         }
1305       }
1306     }
1307     ierr = VecRestoreArray(ll,&l);CHKERRQ(ierr);
1308     PLogFlops(a->nz);
1309   }
1310 
1311   if (rr) {
1312     ierr = VecGetArray(rr,&r);CHKERRQ(ierr);
1313     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
1314     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1315     for ( i=0; i<mbs; i++ ) { /* for each block row */
1316       M  = ai[i+1] - ai[i];
1317       v  = aa + bs2*ai[i];
1318       for ( j=0; j<M; j++ ) { /* for each block */
1319         ri = r + bs*aj[ai[i]+j];
1320         for ( k=0; k<bs; k++ ) {
1321           x = ri[k];
1322           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1323         }
1324       }
1325     }
1326     ierr = VecRestoreArray(rr,&r);CHKERRQ(ierr);
1327     PLogFlops(a->nz);
1328   }
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 
1333 #undef __FUNC__
1334 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1335 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1336 {
1337   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1338 
1339   PetscFunctionBegin;
1340   info->rows_global    = (double)a->m;
1341   info->columns_global = (double)a->n;
1342   info->rows_local     = (double)a->m;
1343   info->columns_local  = (double)a->n;
1344   info->block_size     = a->bs2;
1345   info->nz_allocated   = a->maxnz;
1346   info->nz_used        = a->bs2*a->nz;
1347   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1348   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1349     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1350   info->assemblies   = A->num_ass;
1351   info->mallocs      = a->reallocs;
1352   info->memory       = A->mem;
1353   if (A->factor) {
1354     info->fill_ratio_given  = A->info.fill_ratio_given;
1355     info->fill_ratio_needed = A->info.fill_ratio_needed;
1356     info->factor_mallocs    = A->info.factor_mallocs;
1357   } else {
1358     info->fill_ratio_given  = 0;
1359     info->fill_ratio_needed = 0;
1360     info->factor_mallocs    = 0;
1361   }
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 
1366 #undef __FUNC__
1367 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1368 int MatZeroEntries_SeqBAIJ(Mat A)
1369 {
1370   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1371   int         ierr;
1372 
1373   PetscFunctionBegin;
1374   ierr = PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));CHKERRQ(ierr);
1375   PetscFunctionReturn(0);
1376 }
1377