xref: /petsc/src/mat/impls/baij/seq/baij2.c (revision 76be9ce4a233aaa47cda2bc7f5a27cd7faabecaa)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij2.c,v 1.20 1997/12/01 01:55:02 bsmith Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "sys.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/spops.h"
10 #include "petsc.h"
11 #include "src/inline/bitarray.h"
12 
13 
14 #undef __FUNC__
15 #define __FUNC__ "MatIncreaseOverlap_SeqBAIJ"
16 int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
17 {
18   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
19   int         row, i,j,k,l,m,n, *idx,ierr, *nidx, isz, val, ival;
20   int         start, end, *ai, *aj,bs,*nidx2;
21   BT          table;
22 
23   PetscFunctionBegin;
24   m     = a->mbs;
25   ai    = a->i;
26   aj    = a->j;
27   bs    = a->bs;
28 
29   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative overlap specified");
30 
31   ierr  = BTCreate(m,table); CHKERRQ(ierr);
32   nidx  = (int *) PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(nidx);
33   nidx2 = (int *)PetscMalloc((a->m+1)*sizeof(int)); CHKPTRQ(nidx2);
34 
35   for ( i=0; i<is_max; i++ ) {
36     /* Initialise the two local arrays */
37     isz  = 0;
38     BTMemzero(m,table);
39 
40     /* Extract the indices, assume there can be duplicate entries */
41     ierr = ISGetIndices(is[i],&idx);  CHKERRQ(ierr);
42     ierr = ISGetSize(is[i],&n);  CHKERRQ(ierr);
43 
44     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
45     for ( j=0; j<n ; ++j){
46       ival = idx[j]/bs; /* convert the indices into block indices */
47       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"index greater than mat-dim");
48       if(!BTLookupSet(table, ival)) { nidx[isz++] = ival;}
49     }
50     ierr = ISRestoreIndices(is[i],&idx);  CHKERRQ(ierr);
51     ierr = ISDestroy(is[i]); CHKERRQ(ierr);
52 
53     k = 0;
54     for ( j=0; j<ov; j++){ /* for each overlap*/
55       n = isz;
56       for ( ; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
57         row   = nidx[k];
58         start = ai[row];
59         end   = ai[row+1];
60         for ( l = start; l<end ; l++){
61           val = aj[l];
62           if (!BTLookupSet(table,val)) {nidx[isz++] = val;}
63         }
64       }
65     }
66     /* expand the Index Set */
67     for (j=0; j<isz; j++ ) {
68       for (k=0; k<bs; k++ )
69         nidx2[j*bs+k] = nidx[j]*bs+k;
70     }
71     ierr = ISCreateGeneral(PETSC_COMM_SELF, isz*bs, nidx2, (is+i)); CHKERRQ(ierr);
72   }
73   BTDestroy(table);
74   PetscFree(nidx);
75   PetscFree(nidx2);
76   PetscFunctionReturn(0);
77 }
78 
79 #undef __FUNC__
80 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ_Private"
81 int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatGetSubMatrixCall scall,Mat *B)
82 {
83   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ *) A->data,*c;
84   int          nznew, *smap, i, k, kstart, kend, ierr, oldcols = a->nbs,*lens;
85   int          row,mat_i,*mat_j,tcol,*mat_ilen;
86   int          *irow, *icol, nrows, ncols,*ssmap,bs=a->bs, bs2=a->bs2;
87   int          *aj = a->j, *ai = a->i;
88   Scalar       *mat_a;
89   Mat          C;
90 
91   PetscFunctionBegin;
92   ierr = ISSorted(iscol,(PetscTruth*)&i);
93   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IS is not sorted");
94 
95   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
96   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
97   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
98   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
99 
100   smap  = (int *) PetscMalloc((1+oldcols)*sizeof(int)); CHKPTRQ(smap);
101   ssmap = smap;
102   lens  = (int *) PetscMalloc((1+nrows)*sizeof(int)); CHKPTRQ(lens);
103   PetscMemzero(smap,oldcols*sizeof(int));
104   for ( i=0; i<ncols; i++ ) smap[icol[i]] = i+1;
105   /* determine lens of each row */
106   for (i=0; i<nrows; i++) {
107     kstart  = ai[irow[i]];
108     kend    = kstart + a->ilen[irow[i]];
109     lens[i] = 0;
110       for ( k=kstart; k<kend; k++ ) {
111         if (ssmap[aj[k]]) {
112           lens[i]++;
113         }
114       }
115     }
116   /* Create and fill new matrix */
117   if (scall == MAT_REUSE_MATRIX) {
118     c = (Mat_SeqBAIJ *)((*B)->data);
119 
120     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Submatrix wrong size");
121     if (PetscMemcmp(c->ilen,lens, c->mbs *sizeof(int))) {
122       SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
123     }
124     PetscMemzero(c->ilen,c->mbs*sizeof(int));
125     C = *B;
126   } else {
127     ierr = MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);CHKERRQ(ierr);
128   }
129   c = (Mat_SeqBAIJ *)(C->data);
130   for (i=0; i<nrows; i++) {
131     row    = irow[i];
132     nznew  = 0;
133     kstart = ai[row];
134     kend   = kstart + a->ilen[row];
135     mat_i  = c->i[i];
136     mat_j  = c->j + mat_i;
137     mat_a  = c->a + mat_i*bs2;
138     mat_ilen = c->ilen + i;
139     for ( k=kstart; k<kend; k++ ) {
140       if ((tcol=ssmap[a->j[k]])) {
141         *mat_j++ = tcol - 1;
142         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(Scalar)); mat_a+=bs2;
143         (*mat_ilen)++;
144 
145       }
146     }
147   }
148 
149   /* Free work space */
150   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
151   PetscFree(smap); PetscFree(lens);
152   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
153   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
154 
155   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
156   *B = C;
157   PetscFunctionReturn(0);
158 }
159 
160 #undef __FUNC__
161 #define __FUNC__ "MatGetSubMatrix_SeqBAIJ"
162 int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatGetSubMatrixCall scall,Mat *B)
163 {
164   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
165   IS          is1,is2;
166   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
167 
168   PetscFunctionBegin;
169   ierr = ISGetIndices(isrow,&irow); CHKERRQ(ierr);
170   ierr = ISGetIndices(iscol,&icol); CHKERRQ(ierr);
171   ierr = ISGetSize(isrow,&nrows); CHKERRQ(ierr);
172   ierr = ISGetSize(iscol,&ncols); CHKERRQ(ierr);
173 
174   /* Verify if the indices corespond to each elementin a block
175    and form the IS with compressed IS */
176   vary = (int *) PetscMalloc(2*(a->mbs+1)*sizeof(int)); CHKPTRQ(vary);
177   iary = vary + a->mbs;
178   PetscMemzero(vary,(a->mbs)*sizeof(int));
179   for ( i=0; i<nrows; i++) vary[irow[i]/bs]++;
180   count = 0;
181   for (i=0; i<a->mbs; i++) {
182     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
183     if (vary[i]==bs) iary[count++] = i;
184   }
185   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is1); CHKERRQ(ierr);
186 
187   PetscMemzero(vary,(a->mbs)*sizeof(int));
188   for ( i=0; i<ncols; i++) vary[icol[i]/bs]++;
189   count = 0;
190   for (i=0; i<a->mbs; i++) {
191     if (vary[i]!=0 && vary[i]!=bs) SETERRA(1,0,"MatGetSubmatrices_SeqBAIJ:");
192     if (vary[i]==bs) iary[count++] = i;
193   }
194   ierr = ISCreateGeneral(PETSC_COMM_SELF, count, iary,&is2); CHKERRQ(ierr);
195   ierr = ISRestoreIndices(isrow,&irow); CHKERRQ(ierr);
196   ierr = ISRestoreIndices(iscol,&icol); CHKERRQ(ierr);
197   PetscFree(vary);
198 
199   ierr = MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B); CHKERRQ(ierr);
200   ISDestroy(is1);
201   ISDestroy(is2);
202   PetscFunctionReturn(0);
203 }
204 
205 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
206 
207 #undef __FUNC__
208 #define __FUNC__ "MatGetSubMatrices_SeqBAIJ"
209 int MatGetSubMatrices_SeqBAIJ(Mat A,int n, IS *irow,IS *icol,MatGetSubMatrixCall scall,
210                                     Mat **B)
211 {
212   int ierr,i;
213 
214   PetscFunctionBegin;
215   if (scall == MAT_INITIAL_MATRIX) {
216     *B = (Mat *) PetscMalloc( (n+1)*sizeof(Mat) ); CHKPTRQ(*B);
217   }
218 
219   for ( i=0; i<n; i++ ) {
220     ierr = MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
221   }
222   PetscFunctionReturn(0);
223 }
224 
225 
226 /* -------------------------------------------------------*/
227 /* Should check that shapes of vectors and matrices match */
228 /* -------------------------------------------------------*/
229 #include "pinclude/plapack.h"
230 
231 #undef __FUNC__
232 #define __FUNC__ "MatMult_SeqBAIJ_1"
233 int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234 {
235   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
236   register Scalar *x,*z,*v,sum;
237   int             mbs=a->mbs,i,*idx,*ii,n;
238 
239   PetscFunctionBegin;
240   VecGetArray_Fast(xx,x);
241   VecGetArray_Fast(zz,z);
242 
243   idx   = a->j;
244   v     = a->a;
245   ii    = a->i;
246 
247   for ( i=0; i<mbs; i++ ) {
248     n    = ii[1] - ii[0]; ii++;
249     sum  = 0.0;
250     while (n--) sum += *v++ * x[*idx++];
251     z[i] = sum;
252   }
253   VecRestoreArray_Fast(xx,x);
254   VecRestoreArray_Fast(zz,z);
255   PLogFlops(2*a->nz - a->m);
256   PetscFunctionReturn(0);
257 }
258 
259 #undef __FUNC__
260 #define __FUNC__ "MatMult_SeqBAIJ_2"
261 int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
262 {
263   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
264   register Scalar *x,*z,*v,*xb,sum1,sum2;
265   register Scalar x1,x2;
266   int             mbs=a->mbs,i,*idx,*ii,j,n;
267 
268   PetscFunctionBegin;
269   VecGetArray_Fast(xx,x);
270   VecGetArray_Fast(zz,z);
271 
272   idx   = a->j;
273   v     = a->a;
274   ii    = a->i;
275 
276   for ( i=0; i<mbs; i++ ) {
277     n  = ii[1] - ii[0]; ii++;
278     sum1 = 0.0; sum2 = 0.0;
279     for ( j=0; j<n; j++ ) {
280       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
281       sum1 += v[0]*x1 + v[2]*x2;
282       sum2 += v[1]*x1 + v[3]*x2;
283       v += 4;
284     }
285     z[0] = sum1; z[1] = sum2;
286     z += 2;
287   }
288   VecRestoreArray_Fast(xx,x);
289   VecRestoreArray_Fast(zz,z);
290   PLogFlops(4*a->nz - a->m);
291   PetscFunctionReturn(0);
292 }
293 
294 #undef __FUNC__
295 #define __FUNC__ "MatMult_SeqBAIJ_3"
296 int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
297 {
298   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
299   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
300   int             mbs=a->mbs,i,*idx,*ii,j,n;
301 
302   PetscFunctionBegin;
303   VecGetArray_Fast(xx,x);
304   VecGetArray_Fast(zz,z);
305 
306   idx   = a->j;
307   v     = a->a;
308   ii    = a->i;
309 
310   for ( i=0; i<mbs; i++ ) {
311     n  = ii[1] - ii[0]; ii++;
312     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
313     for ( j=0; j<n; j++ ) {
314       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
315       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
316       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
317       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
318       v += 9;
319     }
320     z[0] = sum1; z[1] = sum2; z[2] = sum3;
321     z += 3;
322   }
323   VecRestoreArray_Fast(xx,x);
324   VecRestoreArray_Fast(zz,z);
325   PLogFlops(18*a->nz - a->m);
326   PetscFunctionReturn(0);
327 }
328 
329 #undef __FUNC__
330 #define __FUNC__ "MatMult_SeqBAIJ_4"
331 int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
332 {
333   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
334   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
335   int             mbs=a->mbs,i,*idx,*ii,j,n;
336 
337   PetscFunctionBegin;
338   VecGetArray_Fast(xx,x);
339   VecGetArray_Fast(zz,z);
340 
341   idx   = a->j;
342   v     = a->a;
343   ii    = a->i;
344 
345   for ( i=0; i<mbs; i++ ) {
346     n  = ii[1] - ii[0]; ii++;
347     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
348     for ( j=0; j<n; j++ ) {
349       xb = x + 4*(*idx++);
350       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
351       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
352       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
353       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
354       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
355       v += 16;
356     }
357     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
358     z += 4;
359   }
360   VecRestoreArray_Fast(xx,x);
361   VecRestoreArray_Fast(zz,z);
362   PLogFlops(32*a->nz - a->m);
363   PetscFunctionReturn(0);
364 }
365 
366 #undef __FUNC__
367 #define __FUNC__ "MatMult_SeqBAIJ_5"
368 int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
369 {
370   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
371   register Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
372   register Scalar * restrict v,* restrict xb,* restrict z, * restrict x;
373   int             mbs=a->mbs,i,*idx,*ii,j,n;
374 
375   VecGetArray_Fast(xx,x);
376   VecGetArray_Fast(zz,z);
377 
378   idx   = a->j;
379   v     = a->a;
380   ii    = a->i;
381 
382   for ( i=0; i<mbs; i++ ) {
383     n  = ii[1] - ii[0]; ii++;
384     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
385     for ( j=0; j<n; j++ ) {
386       xb = x + 5*(*idx++);
387       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
388       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
389       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
390       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
391       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
392       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
393       v += 25;
394     }
395     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
396     z += 5;
397   }
398   VecRestoreArray_Fast(xx,x);
399   VecRestoreArray_Fast(zz,z);
400   PLogFlops(50*a->nz - a->m);
401   PetscFunctionReturn(0);
402 }
403 
404 #undef __FUNC__
405 #define __FUNC__ "MatMult_SeqBAIJ_7"
406 int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
407 {
408   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
409   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
410   register Scalar x1,x2,x3,x4,x5,x6,x7;
411   int             mbs=a->mbs,i,*idx,*ii,j,n;
412 
413   VecGetArray_Fast(xx,x);
414   VecGetArray_Fast(zz,z);
415 
416   idx   = a->j;
417   v     = a->a;
418   ii    = a->i;
419 
420   for ( i=0; i<mbs; i++ ) {
421     n  = ii[1] - ii[0]; ii++;
422     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
423     for ( j=0; j<n; j++ ) {
424       xb = x + 7*(*idx++);
425       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
426       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
427       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
428       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
429       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
430       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
431       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
432       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
433       v += 49;
434     }
435     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
436     z += 7;
437   }
438 
439   VecRestoreArray_Fast(xx,x);
440   VecRestoreArray_Fast(zz,z);
441   PLogFlops(98*a->nz - a->m);
442   PetscFunctionReturn(0);
443 }
444 
445 #undef __FUNC__
446 #define __FUNC__ "MatMult_SeqBAIJ_N"
447 int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
448 {
449   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
450   register Scalar *x,*z,*v,*xb;
451   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
452   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
453   int             _One = 1,ncols,k;
454 
455   PetscFunctionBegin;
456   VecGetArray_Fast(xx,x);
457   VecGetArray_Fast(zz,z);
458 
459   idx   = a->j;
460   v     = a->a;
461   ii    = a->i;
462 
463 
464   if (!a->mult_work) {
465     k = PetscMax(a->m,a->n);
466     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
467   }
468   work = a->mult_work;
469   for ( i=0; i<mbs; i++ ) {
470     n     = ii[1] - ii[0]; ii++;
471     ncols = n*bs;
472     workt = work;
473     for ( j=0; j<n; j++ ) {
474       xb = x + bs*(*idx++);
475       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
476       workt += bs;
477     }
478     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
479     v += n*bs2;
480     z += bs;
481   }
482   VecRestoreArray_Fast(xx,x);
483   VecRestoreArray_Fast(zz,z);
484   PLogFlops(2*a->nz*bs2 - a->m);
485   PetscFunctionReturn(0);
486 }
487 
488 #undef __FUNC__
489 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
490 int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
491 {
492   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
493   register Scalar *x,*y,*z,*v,sum;
494   int             mbs=a->mbs,i,*idx,*ii,n;
495 
496   PetscFunctionBegin;
497   VecGetArray_Fast(xx,x);
498   VecGetArray_Fast(yy,y);
499   VecGetArray_Fast(zz,z);
500 
501   idx   = a->j;
502   v     = a->a;
503   ii    = a->i;
504 
505   for ( i=0; i<mbs; i++ ) {
506     n    = ii[1] - ii[0]; ii++;
507     sum  = y[i];
508     while (n--) sum += *v++ * x[*idx++];
509     z[i] = sum;
510   }
511   VecRestoreArray_Fast(xx,x);
512   VecRestoreArray_Fast(yy,y);
513   VecRestoreArray_Fast(zz,z);
514   PLogFlops(2*a->nz);
515   PetscFunctionReturn(0);
516 }
517 
518 #undef __FUNC__
519 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
520 int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
521 {
522   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
523   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
524   register Scalar x1,x2;
525   int             mbs=a->mbs,i,*idx,*ii,j,n;
526 
527   PetscFunctionBegin;
528   VecGetArray_Fast(xx,x);
529   VecGetArray_Fast(yy,y);
530   VecGetArray_Fast(zz,z);
531 
532   idx   = a->j;
533   v     = a->a;
534   ii    = a->i;
535 
536   for ( i=0; i<mbs; i++ ) {
537     n  = ii[1] - ii[0]; ii++;
538     sum1 = y[0]; sum2 = y[1];
539     for ( j=0; j<n; j++ ) {
540       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
541       sum1 += v[0]*x1 + v[2]*x2;
542       sum2 += v[1]*x1 + v[3]*x2;
543       v += 4;
544     }
545     z[0] = sum1; z[1] = sum2;
546     z += 2; y += 2;
547   }
548   VecRestoreArray_Fast(xx,x);
549   VecRestoreArray_Fast(yy,y);
550   VecRestoreArray_Fast(zz,z);
551   PLogFlops(4*a->nz);
552   PetscFunctionReturn(0);
553 }
554 
555 #undef __FUNC__
556 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
557 int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
558 {
559   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
560   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
561   int             mbs=a->mbs,i,*idx,*ii,j,n;
562 
563   PetscFunctionBegin;
564   VecGetArray_Fast(xx,x);
565   VecGetArray_Fast(yy,y);
566   VecGetArray_Fast(zz,z);
567 
568   idx   = a->j;
569   v     = a->a;
570   ii    = a->i;
571 
572   for ( i=0; i<mbs; i++ ) {
573     n  = ii[1] - ii[0]; ii++;
574     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
575     for ( j=0; j<n; j++ ) {
576       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
577       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
578       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
579       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
580       v += 9;
581     }
582     z[0] = sum1; z[1] = sum2; z[2] = sum3;
583     z += 3; y += 3;
584   }
585   VecRestoreArray_Fast(xx,x);
586   VecRestoreArray_Fast(yy,y);
587   VecRestoreArray_Fast(zz,z);
588   PLogFlops(18*a->nz);
589   PetscFunctionReturn(0);
590 }
591 
592 #undef __FUNC__
593 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
594 int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
595 {
596   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
597   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
598   int             mbs=a->mbs,i,*idx,*ii;
599   int             j,n;
600 
601   PetscFunctionBegin;
602   VecGetArray_Fast(xx,x);
603   VecGetArray_Fast(yy,y);
604   VecGetArray_Fast(zz,z);
605 
606   idx   = a->j;
607   v     = a->a;
608   ii    = a->i;
609 
610   for ( i=0; i<mbs; i++ ) {
611     n  = ii[1] - ii[0]; ii++;
612     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
613     for ( j=0; j<n; j++ ) {
614       xb = x + 4*(*idx++);
615       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
616       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
617       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
618       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
619       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
620       v += 16;
621     }
622     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
623     z += 4; y += 4;
624   }
625   VecRestoreArray_Fast(xx,x);
626   VecRestoreArray_Fast(yy,y);
627   VecRestoreArray_Fast(zz,z);
628   PLogFlops(32*a->nz);
629   PetscFunctionReturn(0);
630 }
631 
632 #undef __FUNC__
633 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
634 int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
635 {
636   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
637   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
638   int             mbs=a->mbs,i,*idx,*ii,j,n;
639 
640   PetscFunctionBegin;
641   VecGetArray_Fast(xx,x);
642   VecGetArray_Fast(yy,y);
643   VecGetArray_Fast(zz,z);
644 
645   idx   = a->j;
646   v     = a->a;
647   ii    = a->i;
648 
649   for ( i=0; i<mbs; i++ ) {
650     n  = ii[1] - ii[0]; ii++;
651     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
652     for ( j=0; j<n; j++ ) {
653       xb = x + 5*(*idx++);
654       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
655       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
656       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
657       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
658       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
659       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
660       v += 25;
661     }
662     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
663     z += 5; y += 5;
664   }
665   VecRestoreArray_Fast(xx,x);
666   VecRestoreArray_Fast(yy,y);
667   VecRestoreArray_Fast(zz,z);
668   PLogFlops(50*a->nz);
669   PetscFunctionReturn(0);
670 }
671 
672 #undef __FUNC__
673 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
674 int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
675 {
676   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
677   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
678   register Scalar x1,x2,x3,x4,x5,x6,x7;
679   int             mbs=a->mbs,i,*idx,*ii,j,n;
680 
681   PetscFunctionBegin;
682   VecGetArray_Fast(xx,x);
683   VecGetArray_Fast(yy,y);
684   VecGetArray_Fast(zz,z);
685 
686   idx   = a->j;
687   v     = a->a;
688   ii    = a->i;
689 
690   for ( i=0; i<mbs; i++ ) {
691     n  = ii[1] - ii[0]; ii++;
692     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
693     for ( j=0; j<n; j++ ) {
694       xb = x + 7*(*idx++);
695       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
696       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
697       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
698       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
699       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
700       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
701       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
702       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
703       v += 49;
704     }
705     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
706     z += 7; y += 7;
707   }
708   VecRestoreArray_Fast(xx,x);
709   VecRestoreArray_Fast(yy,y);
710   VecRestoreArray_Fast(zz,z);
711   PLogFlops(98*a->nz);
712   PetscFunctionReturn(0);
713 }
714 
715 
716 #undef __FUNC__
717 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
718 int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
719 {
720   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
721   register Scalar *x,*z,*v,*xb;
722   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
723   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
724 
725   PetscFunctionBegin;
726   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
727 
728   VecGetArray_Fast(xx,x);
729   VecGetArray_Fast(zz,z);
730 
731   idx   = a->j;
732   v     = a->a;
733   ii    = a->i;
734 
735 
736   if (!a->mult_work) {
737     k = PetscMax(a->m,a->n);
738     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
739   }
740   work = a->mult_work;
741   for ( i=0; i<mbs; i++ ) {
742     n     = ii[1] - ii[0]; ii++;
743     ncols = n*bs;
744     workt = work;
745     for ( j=0; j<n; j++ ) {
746       xb = x + bs*(*idx++);
747       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
748       workt += bs;
749     }
750     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
751     v += n*bs2;
752     z += bs;
753   }
754   VecRestoreArray_Fast(xx,x);
755   VecRestoreArray_Fast(zz,z);
756   PLogFlops(2*a->nz*bs2 );
757   PetscFunctionReturn(0);
758 }
759 
760 #undef __FUNC__
761 #define __FUNC__ "MatMultTrans_SeqBAIJ"
762 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
763 {
764   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
765   Scalar          *xg,*zg,*zb;
766   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
767   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
768   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
769 
770 
771   PetscFunctionBegin;
772   VecGetArray_Fast(xx,xg); x = xg;
773   VecGetArray_Fast(zz,zg); z = zg;
774   PetscMemzero(z,N*sizeof(Scalar));
775 
776   idx   = a->j;
777   v     = a->a;
778   ii    = a->i;
779 
780   switch (bs) {
781   case 1:
782     for ( i=0; i<mbs; i++ ) {
783       n  = ii[1] - ii[0]; ii++;
784       xb = x + i; x1 = xb[0];
785       ib = idx + ai[i];
786       for ( j=0; j<n; j++ ) {
787         rval    = ib[j];
788         z[rval] += *v++ * x1;
789       }
790     }
791     break;
792   case 2:
793     for ( i=0; i<mbs; i++ ) {
794       n  = ii[1] - ii[0]; ii++;
795       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
796       ib = idx + ai[i];
797       for ( j=0; j<n; j++ ) {
798         rval      = ib[j]*2;
799         z[rval++] += v[0]*x1 + v[1]*x2;
800         z[rval++] += v[2]*x1 + v[3]*x2;
801         v += 4;
802       }
803     }
804     break;
805   case 3:
806     for ( i=0; i<mbs; i++ ) {
807       n  = ii[1] - ii[0]; ii++;
808       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
809       ib = idx + ai[i];
810       for ( j=0; j<n; j++ ) {
811         rval      = ib[j]*3;
812         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
813         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
814         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
815         v += 9;
816       }
817     }
818     break;
819   case 4:
820     for ( i=0; i<mbs; i++ ) {
821       n  = ii[1] - ii[0]; ii++;
822       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
823       ib = idx + ai[i];
824       for ( j=0; j<n; j++ ) {
825         rval      = ib[j]*4;
826         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
827         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
828         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
829         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
830         v += 16;
831       }
832     }
833     break;
834   case 5:
835     for ( i=0; i<mbs; i++ ) {
836       n  = ii[1] - ii[0]; ii++;
837       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
838       x4 = xb[3];   x5 = xb[4];
839       ib = idx + ai[i];
840       for ( j=0; j<n; j++ ) {
841         rval      = ib[j]*5;
842         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
843         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
844         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
845         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
846         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
847         v += 25;
848       }
849     }
850     break;
851       /* block sizes larger then 5 by 5 are handled by BLAS */
852     default: {
853       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
854       if (!a->mult_work) {
855         k = PetscMax(a->m,a->n);
856         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
857         CHKPTRQ(a->mult_work);
858       }
859       work = a->mult_work;
860       for ( i=0; i<mbs; i++ ) {
861         n     = ii[1] - ii[0]; ii++;
862         ncols = n*bs;
863         PetscMemzero(work,ncols*sizeof(Scalar));
864         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
865         v += n*bs2;
866         x += bs;
867         workt = work;
868         for ( j=0; j<n; j++ ) {
869           zb = z + bs*(*idx++);
870           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
871           workt += bs;
872         }
873       }
874     }
875   }
876   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
877   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
878   PLogFlops(2*a->nz*a->bs2 - a->n);
879   PetscFunctionReturn(0);
880 }
881 
882 #undef __FUNC__
883 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
884 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
885 
886 {
887   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
888   Scalar          *xg,*zg,*zb;
889   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
890   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
891   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
892 
893   PetscFunctionBegin;
894   VecGetArray_Fast(xx,xg); x = xg;
895   VecGetArray_Fast(zz,zg); z = zg;
896 
897   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
898   else PetscMemzero(z,N*sizeof(Scalar));
899 
900   idx   = a->j;
901   v     = a->a;
902   ii    = a->i;
903 
904   switch (bs) {
905   case 1:
906     for ( i=0; i<mbs; i++ ) {
907       n  = ii[1] - ii[0]; ii++;
908       xb = x + i; x1 = xb[0];
909       ib = idx + ai[i];
910       for ( j=0; j<n; j++ ) {
911         rval    = ib[j];
912         z[rval] += *v++ * x1;
913       }
914     }
915     break;
916   case 2:
917     for ( i=0; i<mbs; i++ ) {
918       n  = ii[1] - ii[0]; ii++;
919       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
920       ib = idx + ai[i];
921       for ( j=0; j<n; j++ ) {
922         rval      = ib[j]*2;
923         z[rval++] += v[0]*x1 + v[1]*x2;
924         z[rval++] += v[2]*x1 + v[3]*x2;
925         v += 4;
926       }
927     }
928     break;
929   case 3:
930     for ( i=0; i<mbs; i++ ) {
931       n  = ii[1] - ii[0]; ii++;
932       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
933       ib = idx + ai[i];
934       for ( j=0; j<n; j++ ) {
935         rval      = ib[j]*3;
936         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
937         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
938         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
939         v += 9;
940       }
941     }
942     break;
943   case 4:
944     for ( i=0; i<mbs; i++ ) {
945       n  = ii[1] - ii[0]; ii++;
946       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
947       ib = idx + ai[i];
948       for ( j=0; j<n; j++ ) {
949         rval      = ib[j]*4;
950         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
951         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
952         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
953         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
954         v += 16;
955       }
956     }
957     break;
958   case 5:
959     for ( i=0; i<mbs; i++ ) {
960       n  = ii[1] - ii[0]; ii++;
961       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
962       x4 = xb[3];   x5 = xb[4];
963       ib = idx + ai[i];
964       for ( j=0; j<n; j++ ) {
965         rval      = ib[j]*5;
966         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
967         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
968         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
969         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
970         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
971         v += 25;
972       }
973     }
974     break;
975       /* block sizes larger then 5 by 5 are handled by BLAS */
976     default: {
977       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
978       if (!a->mult_work) {
979         k = PetscMax(a->m,a->n);
980         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
981         CHKPTRQ(a->mult_work);
982       }
983       work = a->mult_work;
984       for ( i=0; i<mbs; i++ ) {
985         n     = ii[1] - ii[0]; ii++;
986         ncols = n*bs;
987         PetscMemzero(work,ncols*sizeof(Scalar));
988         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
989         v += n*bs2;
990         x += bs;
991         workt = work;
992         for ( j=0; j<n; j++ ) {
993           zb = z + bs*(*idx++);
994           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
995           workt += bs;
996         }
997       }
998     }
999   }
1000   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1001   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1002   PLogFlops(2*a->nz*a->bs2);
1003   PetscFunctionReturn(0);
1004 }
1005 
1006 #undef __FUNC__
1007 #define __FUNC__ "MatScale_SeqBAIJ"
1008 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1009 {
1010   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1011   int         one = 1, totalnz = a->bs2*a->nz;
1012 
1013   PetscFunctionBegin;
1014   BLscal_( &totalnz, alpha, a->a, &one );
1015   PLogFlops(totalnz);
1016   PetscFunctionReturn(0);
1017 }
1018 
1019 #undef __FUNC__
1020 #define __FUNC__ "MatNorm_SeqBAIJ"
1021 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1022 {
1023   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1024   Scalar      *v = a->a;
1025   double      sum = 0.0;
1026   int         i,nz=a->nz,bs2=a->bs2;
1027 
1028   PetscFunctionBegin;
1029   if (type == NORM_FROBENIUS) {
1030     for (i=0; i< bs2*nz; i++ ) {
1031 #if defined(USE_PETSC_COMPLEX)
1032       sum += real(conj(*v)*(*v)); v++;
1033 #else
1034       sum += (*v)*(*v); v++;
1035 #endif
1036     }
1037     *norm = sqrt(sum);
1038   }
1039   else {
1040     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1041   }
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 
1046 #undef __FUNC__
1047 #define __FUNC__ "MatEqual_SeqBAIJ"
1048 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1049 {
1050   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1051 
1052   PetscFunctionBegin;
1053   if (B->type !=MATSEQBAIJ) SETERRQ(PETSC_ERR_ARG_INCOMP,0,"Matrices must be same type");
1054 
1055   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1056   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1057     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1058   }
1059 
1060   /* if the a->i are the same */
1061   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1062     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1063   }
1064 
1065   /* if a->j are the same */
1066   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1067     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1068   }
1069 
1070   /* if a->a are the same */
1071   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1072     *flg = PETSC_FALSE; PetscFunctionReturn(0);
1073   }
1074   *flg = PETSC_TRUE;
1075   PetscFunctionReturn(0);
1076 
1077 }
1078 
1079 #undef __FUNC__
1080 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1081 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1082 {
1083   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1084   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1085   Scalar      *x, zero = 0.0,*aa,*aa_j;
1086 
1087   PetscFunctionBegin;
1088   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Not for factored matrix");
1089   bs   = a->bs;
1090   aa   = a->a;
1091   ai   = a->i;
1092   aj   = a->j;
1093   ambs = a->mbs;
1094   bs2  = a->bs2;
1095 
1096   VecSet(&zero,v);
1097   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1098   if (n != a->m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Nonconforming matrix and vector");
1099   for ( i=0; i<ambs; i++ ) {
1100     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1101       if (aj[j] == i) {
1102         row  = i*bs;
1103         aa_j = aa+j*bs2;
1104         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1105         break;
1106       }
1107     }
1108   }
1109   PetscFunctionReturn(0);
1110 }
1111 
1112 #undef __FUNC__
1113 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1114 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1115 {
1116   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1117   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1118   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1119 
1120   PetscFunctionBegin;
1121   ai  = a->i;
1122   aj  = a->j;
1123   aa  = a->a;
1124   m   = a->m;
1125   n   = a->n;
1126   bs  = a->bs;
1127   mbs = a->mbs;
1128   bs2 = a->bs2;
1129   if (ll) {
1130     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1131     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Left scaling vector wrong length");
1132     for ( i=0; i<mbs; i++ ) { /* for each block row */
1133       M  = ai[i+1] - ai[i];
1134       li = l + i*bs;
1135       v  = aa + bs2*ai[i];
1136       for ( j=0; j<M; j++ ) { /* for each block */
1137         for ( k=0; k<bs2; k++ ) {
1138           (*v++) *= li[k%bs];
1139         }
1140       }
1141     }
1142   }
1143 
1144   if (rr) {
1145     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1146     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Right scaling vector wrong length");
1147     for ( i=0; i<mbs; i++ ) { /* for each block row */
1148       M  = ai[i+1] - ai[i];
1149       v  = aa + bs2*ai[i];
1150       for ( j=0; j<M; j++ ) { /* for each block */
1151         ri = r + bs*aj[ai[i]+j];
1152         for ( k=0; k<bs; k++ ) {
1153           x = ri[k];
1154           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1155         }
1156       }
1157     }
1158   }
1159   PetscFunctionReturn(0);
1160 }
1161 
1162 
1163 #undef __FUNC__
1164 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1165 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1166 {
1167   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1168 
1169   PetscFunctionBegin;
1170   info->rows_global    = (double)a->m;
1171   info->columns_global = (double)a->n;
1172   info->rows_local     = (double)a->m;
1173   info->columns_local  = (double)a->n;
1174   info->block_size     = a->bs2;
1175   info->nz_allocated   = a->maxnz;
1176   info->nz_used        = a->bs2*a->nz;
1177   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1178   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1179     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1180   info->assemblies   = A->num_ass;
1181   info->mallocs      = a->reallocs;
1182   info->memory       = A->mem;
1183   if (A->factor) {
1184     info->fill_ratio_given  = A->info.fill_ratio_given;
1185     info->fill_ratio_needed = A->info.fill_ratio_needed;
1186     info->factor_mallocs    = A->info.factor_mallocs;
1187   } else {
1188     info->fill_ratio_given  = 0;
1189     info->fill_ratio_needed = 0;
1190     info->factor_mallocs    = 0;
1191   }
1192   PetscFunctionReturn(0);
1193 }
1194 
1195 
1196 #undef __FUNC__
1197 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
1198 int MatZeroEntries_SeqBAIJ(Mat A)
1199 {
1200   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1201 
1202   PetscFunctionBegin;
1203   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
1204   PetscFunctionReturn(0);
1205 }
1206