xref: /petsc/src/mat/impls/baij/seq/baij.c (revision 83e2fdc756c13f964f3ed5aaff42e3626c5b72bc)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: baij.c,v 1.108 1997/07/21 02:29:01 bsmith Exp bsmith $";
3 #endif
4 
5 /*
6     Defines the basic matrix operations for the BAIJ (compressed row)
7   matrix storage format.
8 */
9 
10 #include "pinclude/pviewer.h"
11 #include "sys.h"
12 #include "src/mat/impls/baij/seq/baij.h"
13 #include "src/vec/vecimpl.h"
14 #include "src/inline/spops.h"
15 #include "petsc.h"
16 
17 
18 /*
19      Adds diagonal pointers to sparse matrix structure.
20 */
21 
22 #undef __FUNC__
23 #define __FUNC__ "MatMarkDiag_SeqBAIJ"
24 int MatMarkDiag_SeqBAIJ(Mat A)
25 {
26   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
27   int         i,j, *diag, m = a->mbs;
28 
29   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
30   PLogObjectMemory(A,(m+1)*sizeof(int));
31   for ( i=0; i<m; i++ ) {
32     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
33       if (a->j[j] == i) {
34         diag[i] = j;
35         break;
36       }
37     }
38   }
39   a->diag = diag;
40   return 0;
41 }
42 
43 
44 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
45 
46 #undef __FUNC__
47 #define __FUNC__ "MatGetRowIJ_SeqBAIJ"
48 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
49                             PetscTruth *done)
50 {
51   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
52   int         ierr, n = a->mbs,i;
53 
54   *nn = n;
55   if (!ia) return 0;
56   if (symmetric) {
57     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
58   } else if (oshift == 1) {
59     /* temporarily add 1 to i and j indices */
60     int nz = a->i[n] + 1;
61     for ( i=0; i<nz; i++ ) a->j[i]++;
62     for ( i=0; i<n+1; i++ ) a->i[i]++;
63     *ia = a->i; *ja = a->j;
64   } else {
65     *ia = a->i; *ja = a->j;
66   }
67 
68   return 0;
69 }
70 
71 #undef __FUNC__
72 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ" /* ADIC Ignore */
73 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
74                                 PetscTruth *done)
75 {
76   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
77   int         i,n = a->mbs;
78 
79   if (!ia) return 0;
80   if (symmetric) {
81     PetscFree(*ia);
82     PetscFree(*ja);
83   } else if (oshift == 1) {
84     int nz = a->i[n];
85     for ( i=0; i<nz; i++ ) a->j[i]--;
86     for ( i=0; i<n+1; i++ ) a->i[i]--;
87   }
88   return 0;
89 }
90 
91 
92 #undef __FUNC__
93 #define __FUNC__ "MatView_SeqBAIJ_Binary" /* ADIC Ignore */
94 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
95 {
96   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
97   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
98   Scalar      *aa;
99 
100   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
101   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
102   col_lens[0] = MAT_COOKIE;
103 
104   col_lens[1] = a->m;
105   col_lens[2] = a->n;
106   col_lens[3] = a->nz*bs2;
107 
108   /* store lengths of each row and write (including header) to file */
109   count = 0;
110   for ( i=0; i<a->mbs; i++ ) {
111     for ( j=0; j<bs; j++ ) {
112       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
113     }
114   }
115   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,BINARY_INT,1); CHKERRQ(ierr);
116   PetscFree(col_lens);
117 
118   /* store column indices (zero start index) */
119   jj = (int *) PetscMalloc( (a->nz+1)*bs2*sizeof(int) ); CHKPTRQ(jj);
120   count = 0;
121   for ( i=0; i<a->mbs; i++ ) {
122     for ( j=0; j<bs; j++ ) {
123       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
124         for ( l=0; l<bs; l++ ) {
125           jj[count++] = bs*a->j[k] + l;
126         }
127       }
128     }
129   }
130   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,BINARY_INT,0); CHKERRQ(ierr);
131   PetscFree(jj);
132 
133   /* store nonzero values */
134   aa = (Scalar *) PetscMalloc((a->nz+1)*bs2*sizeof(Scalar)); CHKPTRQ(aa);
135   count = 0;
136   for ( i=0; i<a->mbs; i++ ) {
137     for ( j=0; j<bs; j++ ) {
138       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
139         for ( l=0; l<bs; l++ ) {
140           aa[count++] = a->a[bs2*k + l*bs + j];
141         }
142       }
143     }
144   }
145   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,BINARY_SCALAR,0); CHKERRQ(ierr);
146   PetscFree(aa);
147   return 0;
148 }
149 
150 #undef __FUNC__
151 #define __FUNC__ "MatView_SeqBAIJ_ASCII" /* ADIC Ignore */
152 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
153 {
154   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
155   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
156   FILE        *fd;
157   char        *outputname;
158 
159   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
160   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
161   ierr = ViewerGetFormat(viewer,&format);
162   if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
163     fprintf(fd,"  block size is %d\n",bs);
164   }
165   else if (format == VIEWER_FORMAT_ASCII_MATLAB) {
166     SETERRQ(1,0,"Matlab format not supported");
167   }
168   else if (format == VIEWER_FORMAT_ASCII_COMMON) {
169     for ( i=0; i<a->mbs; i++ ) {
170       for ( j=0; j<bs; j++ ) {
171         fprintf(fd,"row %d:",i*bs+j);
172         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
173           for ( l=0; l<bs; l++ ) {
174 #if defined(PETSC_COMPLEX)
175           if (imag(a->a[bs2*k + l*bs + j]) > 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
176             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
177               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
178           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
179             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
180               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
181           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
182             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
183 #else
184           if (a->a[bs2*k + l*bs + j] != 0.0)
185             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
186 #endif
187           }
188         }
189         fprintf(fd,"\n");
190       }
191     }
192   }
193   else {
194     for ( i=0; i<a->mbs; i++ ) {
195       for ( j=0; j<bs; j++ ) {
196         fprintf(fd,"row %d:",i*bs+j);
197         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
198           for ( l=0; l<bs; l++ ) {
199 #if defined(PETSC_COMPLEX)
200           if (imag(a->a[bs2*k + l*bs + j]) > 0.0) {
201             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
202               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
203           }
204           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0) {
205             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
206               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
207           }
208           else {
209             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
210           }
211 #else
212             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
213 #endif
214           }
215         }
216         fprintf(fd,"\n");
217       }
218     }
219   }
220   fflush(fd);
221   return 0;
222 }
223 
224 #undef __FUNC__
225 #define __FUNC__ "MatView_SeqBAIJ_Draw" /* ADIC Ignore */
226 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer)
227 {
228   Mat_SeqBAIJ  *a=(Mat_SeqBAIJ *) A->data;
229   int          row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2;
230   double       xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r;
231   Scalar       *aa;
232   Draw         draw;
233   DrawButton   button;
234   PetscTruth   isnull;
235 
236   ViewerDrawGetDraw(viewer,&draw);
237   ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
238 
239   xr  = a->n; yr = a->m; h = yr/10.0; w = xr/10.0;
240   xr += w;    yr += h;  xl = -w;     yl = -h;
241   ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
242   /* loop over matrix elements drawing boxes */
243   color = DRAW_BLUE;
244   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
245     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
246       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
247       x_l = a->j[j]*bs; x_r = x_l + 1.0;
248       aa = a->a + j*bs2;
249       for ( k=0; k<bs; k++ ) {
250         for ( l=0; l<bs; l++ ) {
251           if (PetscReal(*aa++) >=  0.) continue;
252           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
253         }
254       }
255     }
256   }
257   color = DRAW_CYAN;
258   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
259     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
260       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
261       x_l = a->j[j]*bs; x_r = x_l + 1.0;
262       aa = a->a + j*bs2;
263       for ( k=0; k<bs; k++ ) {
264         for ( l=0; l<bs; l++ ) {
265           if (PetscReal(*aa++) != 0.) continue;
266           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
267         }
268       }
269     }
270   }
271 
272   color = DRAW_RED;
273   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
274     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
275       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
276       x_l = a->j[j]*bs; x_r = x_l + 1.0;
277       aa = a->a + j*bs2;
278       for ( k=0; k<bs; k++ ) {
279         for ( l=0; l<bs; l++ ) {
280           if (PetscReal(*aa++) <= 0.) continue;
281           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
282         }
283       }
284     }
285   }
286 
287   DrawFlush(draw);
288   DrawGetPause(draw,&pause);
289   if (pause >= 0) { PetscSleep(pause); return 0;}
290 
291   /* allow the matrix to zoom or shrink */
292   ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
293   while (button != BUTTON_RIGHT) {
294     DrawClear(draw);
295     if (button == BUTTON_LEFT) scale = .5;
296     else if (button == BUTTON_CENTER) scale = 2.;
297     xl = scale*(xl + w - xc) + xc - w*scale;
298     xr = scale*(xr - w - xc) + xc + w*scale;
299     yl = scale*(yl + h - yc) + yc - h*scale;
300     yr = scale*(yr - h - yc) + yc + h*scale;
301     w *= scale; h *= scale;
302     ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
303     color = DRAW_BLUE;
304     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
305       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
306         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
307         x_l = a->j[j]*bs; x_r = x_l + 1.0;
308         aa = a->a + j*bs2;
309         for ( k=0; k<bs; k++ ) {
310           for ( l=0; l<bs; l++ ) {
311             if (PetscReal(*aa++) >=  0.) continue;
312             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
313           }
314         }
315       }
316     }
317     color = DRAW_CYAN;
318     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
319       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
320         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
321         x_l = a->j[j]*bs; x_r = x_l + 1.0;
322         aa = a->a + j*bs2;
323         for ( k=0; k<bs; k++ ) {
324           for ( l=0; l<bs; l++ ) {
325           if (PetscReal(*aa++) != 0.) continue;
326           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
327           }
328         }
329       }
330     }
331 
332     color = DRAW_RED;
333     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
334       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
335         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
336         x_l = a->j[j]*bs; x_r = x_l + 1.0;
337         aa = a->a + j*bs2;
338         for ( k=0; k<bs; k++ ) {
339           for ( l=0; l<bs; l++ ) {
340             if (PetscReal(*aa++) <= 0.) continue;
341             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
342           }
343         }
344       }
345     }
346     ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
347   }
348   return 0;
349 }
350 
351 #undef __FUNC__
352 #define __FUNC__ "MatView_SeqBAIJ" /* ADIC Ignore */
353 int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
354 {
355   Mat         A = (Mat) obj;
356   ViewerType  vtype;
357   int         ierr;
358 
359   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
360   if (vtype == MATLAB_VIEWER) {
361     SETERRQ(1,0,"Matlab viewer not supported");
362   }
363   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
364     return MatView_SeqBAIJ_ASCII(A,viewer);
365   }
366   else if (vtype == BINARY_FILE_VIEWER) {
367     return MatView_SeqBAIJ_Binary(A,viewer);
368   }
369   else if (vtype == DRAW_VIEWER) {
370     return MatView_SeqBAIJ_Draw(A,viewer);
371   }
372   return 0;
373 }
374 
375 #define CHUNKSIZE  10
376 
377 #undef __FUNC__
378 #define __FUNC__ "MatSetValues_SeqBAIJ"
379 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
380 {
381   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
382   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
383   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
384   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
385   int          ridx,cidx,bs2=a->bs2;
386   Scalar      *ap,value,*aa=a->a,*bap;
387 
388   for ( k=0; k<m; k++ ) { /* loop over added rows */
389     row  = im[k]; brow = row/bs;
390 #if defined(PETSC_BOPT_g)
391     if (row < 0) SETERRQ(1,0,"Negative row");
392     if (row >= a->m) SETERRQ(1,0,"Row too large");
393 #endif
394     rp   = aj + ai[brow];
395     ap   = aa + bs2*ai[brow];
396     rmax = imax[brow];
397     nrow = ailen[brow];
398     low  = 0;
399     for ( l=0; l<n; l++ ) { /* loop over added columns */
400 #if defined(PETSC_BOPT_g)
401       if (in[l] < 0) SETERRQ(1,0,"Negative column");
402       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
403 #endif
404       col = in[l]; bcol = col/bs;
405       ridx = row % bs; cidx = col % bs;
406       if (roworiented) {
407         value = *v++;
408       }
409       else {
410         value = v[k + l*m];
411       }
412       if (!sorted) low = 0; high = nrow;
413       while (high-low > 7) {
414         t = (low+high)/2;
415         if (rp[t] > bcol) high = t;
416         else              low  = t;
417       }
418       for ( i=low; i<high; i++ ) {
419         if (rp[i] > bcol) break;
420         if (rp[i] == bcol) {
421           bap  = ap +  bs2*i + bs*cidx + ridx;
422           if (is == ADD_VALUES) *bap += value;
423           else                  *bap  = value;
424           goto noinsert1;
425         }
426       }
427       if (nonew == 1) goto noinsert1;
428       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
429       if (nrow >= rmax) {
430         /* there is no extra room in row, therefore enlarge */
431         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
432         Scalar *new_a;
433 
434         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
435 
436         /* Malloc new storage space */
437         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
438         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
439         new_j   = (int *) (new_a + bs2*new_nz);
440         new_i   = new_j + new_nz;
441 
442         /* copy over old data into new slots */
443         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
444         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
445         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
446         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
447         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
448                                                            len*sizeof(int));
449         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
450         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
451         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
452                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
453         /* free up old matrix storage */
454         PetscFree(a->a);
455         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
456         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
457         a->singlemalloc = 1;
458 
459         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
460         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
461         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
462         a->maxnz += bs2*CHUNKSIZE;
463         a->reallocs++;
464         a->nz++;
465       }
466       N = nrow++ - 1;
467       /* shift up all the later entries in this row */
468       for ( ii=N; ii>=i; ii-- ) {
469         rp[ii+1] = rp[ii];
470         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
471       }
472       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
473       rp[i]                      = bcol;
474       ap[bs2*i + bs*cidx + ridx] = value;
475       noinsert1:;
476       low = i;
477     }
478     ailen[brow] = nrow;
479   }
480   return 0;
481 }
482 
483 #undef __FUNC__
484 #define __FUNC__ "MatSetValuesBlocked_SeqBAIJ"
485 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
486 {
487   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
488   int         *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
489   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen, roworiented=a->roworiented;
490   int         *aj=a->j,nonew=a->nonew,bs2=a->bs2,bs=a->bs,stepval;
491   Scalar      *ap,*value=v,*aa=a->a,*bap;
492 
493   if (roworiented) {
494     stepval = (n-1)*bs;
495   } else {
496     stepval = (m-1)*bs;
497   }
498   for ( k=0; k<m; k++ ) { /* loop over added rows */
499     row  = im[k];
500 #if defined(PETSC_BOPT_g)
501     if (row < 0) SETERRQ(1,0,"Negative row");
502     if (row >= a->mbs) SETERRQ(1,0,"Row too large");
503 #endif
504     rp   = aj + ai[row];
505     ap   = aa + bs2*ai[row];
506     rmax = imax[row];
507     nrow = ailen[row];
508     low  = 0;
509     for ( l=0; l<n; l++ ) { /* loop over added columns */
510 #if defined(PETSC_BOPT_g)
511       if (in[l] < 0) SETERRQ(1,0,"Negative column");
512       if (in[l] >= a->nbs) SETERRQ(1,0,"Column too large");
513 #endif
514       col = in[l];
515       if (roworiented) {
516         value = v + k*(stepval+bs)*bs + l*bs;
517       } else {
518         value = v + l*(stepval+bs)*bs + k*bs;
519       }
520       if (!sorted) low = 0; high = nrow;
521       while (high-low > 7) {
522         t = (low+high)/2;
523         if (rp[t] > col) high = t;
524         else             low  = t;
525       }
526       for ( i=low; i<high; i++ ) {
527         if (rp[i] > col) break;
528         if (rp[i] == col) {
529           bap  = ap +  bs2*i;
530           if (roworiented) {
531             if (is == ADD_VALUES) {
532               for ( ii=0; ii<bs; ii++,value+=stepval ) {
533                 for (jj=ii; jj<bs2; jj+=bs ) {
534                   bap[jj] += *value++;
535                 }
536               }
537             } else {
538               for ( ii=0; ii<bs; ii++,value+=stepval ) {
539                 for (jj=ii; jj<bs2; jj+=bs ) {
540                   bap[jj] = *value++;
541                 }
542               }
543             }
544           } else {
545             if (is == ADD_VALUES) {
546               for ( ii=0; ii<bs; ii++,value+=stepval ) {
547                 for (jj=0; jj<bs; jj++ ) {
548                   *bap++ += *value++;
549                 }
550               }
551             } else {
552               for ( ii=0; ii<bs; ii++,value+=stepval ) {
553                 for (jj=0; jj<bs; jj++ ) {
554                   *bap++  = *value++;
555                 }
556               }
557             }
558           }
559           goto noinsert2;
560         }
561       }
562       if (nonew == 1) goto noinsert2;
563       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
564       if (nrow >= rmax) {
565         /* there is no extra room in row, therefore enlarge */
566         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
567         Scalar *new_a;
568 
569         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
570 
571         /* malloc new storage space */
572         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
573         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
574         new_j   = (int *) (new_a + bs2*new_nz);
575         new_i   = new_j + new_nz;
576 
577         /* copy over old data into new slots */
578         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];}
579         for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
580         PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int));
581         len = (new_nz - CHUNKSIZE - ai[row] - nrow);
582         PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,len*sizeof(int));
583         PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar));
584         PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
585         PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar));
586         /* free up old matrix storage */
587         PetscFree(a->a);
588         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
589         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
590         a->singlemalloc = 1;
591 
592         rp   = aj + ai[row]; ap = aa + bs2*ai[row];
593         rmax = imax[row] = imax[row] + CHUNKSIZE;
594         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
595         a->maxnz += bs2*CHUNKSIZE;
596         a->reallocs++;
597         a->nz++;
598       }
599       N = nrow++ - 1;
600       /* shift up all the later entries in this row */
601       for ( ii=N; ii>=i; ii-- ) {
602         rp[ii+1] = rp[ii];
603         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
604       }
605       if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
606       rp[i] = col;
607       bap   = ap +  bs2*i;
608       if (roworiented) {
609         for ( ii=0; ii<bs; ii++,value+=stepval) {
610           for (jj=ii; jj<bs2; jj+=bs ) {
611             bap[jj] = *value++;
612           }
613         }
614       } else {
615         for ( ii=0; ii<bs; ii++,value+=stepval ) {
616           for (jj=0; jj<bs; jj++ ) {
617             *bap++  = *value++;
618           }
619         }
620       }
621       noinsert2:;
622       low = i;
623     }
624     ailen[row] = nrow;
625   }
626   return 0;
627 }
628 
629 #undef __FUNC__
630 #define __FUNC__ "MatGetSize_SeqBAIJ" /* ADIC Ignore */
631 int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
632 {
633   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
634   *m = a->m; *n = a->n;
635   return 0;
636 }
637 
638 #undef __FUNC__
639 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ" /* ADIC Ignore */
640 int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
641 {
642   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
643   *m = 0; *n = a->m;
644   return 0;
645 }
646 
647 #undef __FUNC__
648 #define __FUNC__ "MatGetRow_SeqBAIJ"
649 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
650 {
651   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
652   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
653   Scalar      *aa,*v_i,*aa_i;
654 
655   bs  = a->bs;
656   ai  = a->i;
657   aj  = a->j;
658   aa  = a->a;
659   bs2 = a->bs2;
660 
661   if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range");
662 
663   bn  = row/bs;   /* Block number */
664   bp  = row % bs; /* Block Position */
665   M   = ai[bn+1] - ai[bn];
666   *nz = bs*M;
667 
668   if (v) {
669     *v = 0;
670     if (*nz) {
671       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
672       for ( i=0; i<M; i++ ) { /* for each block in the block row */
673         v_i  = *v + i*bs;
674         aa_i = aa + bs2*(ai[bn] + i);
675         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
676       }
677     }
678   }
679 
680   if (idx) {
681     *idx = 0;
682     if (*nz) {
683       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
684       for ( i=0; i<M; i++ ) { /* for each block in the block row */
685         idx_i = *idx + i*bs;
686         itmp  = bs*aj[ai[bn] + i];
687         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
688       }
689     }
690   }
691   return 0;
692 }
693 
694 #undef __FUNC__
695 #define __FUNC__ "MatRestoreRow_SeqBAIJ" /* ADIC Ignore */
696 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
697 {
698   if (idx) {if (*idx) PetscFree(*idx);}
699   if (v)   {if (*v)   PetscFree(*v);}
700   return 0;
701 }
702 
703 #undef __FUNC__
704 #define __FUNC__ "MatTranspose_SeqBAIJ"
705 int MatTranspose_SeqBAIJ(Mat A,Mat *B)
706 {
707   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
708   Mat         C;
709   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
710   int         *rows,*cols,bs2=a->bs2;
711   Scalar      *array=a->a;
712 
713   if (B==PETSC_NULL && mbs!=nbs)
714     SETERRQ(1,0,"Square matrix only for in-place");
715   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
716   PetscMemzero(col,(1+nbs)*sizeof(int));
717 
718   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
719   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
720   PetscFree(col);
721   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
722   cols = rows + bs;
723   for ( i=0; i<mbs; i++ ) {
724     cols[0] = i*bs;
725     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
726     len = ai[i+1] - ai[i];
727     for ( j=0; j<len; j++ ) {
728       rows[0] = (*aj++)*bs;
729       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
730       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
731       array += bs2;
732     }
733   }
734   PetscFree(rows);
735 
736   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
737   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
738 
739   if (B != PETSC_NULL) {
740     *B = C;
741   } else {
742     /* This isn't really an in-place transpose */
743     PetscFree(a->a);
744     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
745     if (a->diag) PetscFree(a->diag);
746     if (a->ilen) PetscFree(a->ilen);
747     if (a->imax) PetscFree(a->imax);
748     if (a->solve_work) PetscFree(a->solve_work);
749     PetscFree(a);
750     PetscMemcpy(A,C,sizeof(struct _p_Mat));
751     PetscHeaderDestroy(C);
752   }
753   return 0;
754 }
755 
756 
757 #undef __FUNC__
758 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ"
759 int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
760 {
761   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
762   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
763   int        m = a->m,*ip, N, *ailen = a->ilen;
764   int        mbs = a->mbs, bs2 = a->bs2,rmax = 0;
765   Scalar     *aa = a->a, *ap;
766 
767   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
768 
769   if (m) rmax = ailen[0];
770   for ( i=1; i<mbs; i++ ) {
771     /* move each row back by the amount of empty slots (fshift) before it*/
772     fshift += imax[i-1] - ailen[i-1];
773     rmax   = PetscMax(rmax,ailen[i]);
774     if (fshift) {
775       ip = aj + ai[i]; ap = aa + bs2*ai[i];
776       N = ailen[i];
777       for ( j=0; j<N; j++ ) {
778         ip[j-fshift] = ip[j];
779         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
780       }
781     }
782     ai[i] = ai[i-1] + ailen[i-1];
783   }
784   if (mbs) {
785     fshift += imax[mbs-1] - ailen[mbs-1];
786     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
787   }
788   /* reset ilen and imax for each row */
789   for ( i=0; i<mbs; i++ ) {
790     ailen[i] = imax[i] = ai[i+1] - ai[i];
791   }
792   a->nz = ai[mbs];
793 
794   /* diagonals may have moved, so kill the diagonal pointers */
795   if (fshift && a->diag) {
796     PetscFree(a->diag);
797     PLogObjectMemory(A,-(m+1)*sizeof(int));
798     a->diag = 0;
799   }
800   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n",
801            m,a->n,a->bs,fshift*bs2,a->nz*bs2);
802   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n",
803            a->reallocs);
804   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax);
805   a->reallocs          = 0;
806   A->info.nz_unneeded  = (double)fshift*bs2;
807 
808   return 0;
809 }
810 
811 #undef __FUNC__
812 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
813 int MatZeroEntries_SeqBAIJ(Mat A)
814 {
815   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
816   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
817   return 0;
818 }
819 
820 #undef __FUNC__
821 #define __FUNC__ "MatDestroy_SeqBAIJ" /* ADIC Ignore */
822 int MatDestroy_SeqBAIJ(PetscObject obj)
823 {
824   Mat         A  = (Mat) obj;
825   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
826 
827 #if defined(PETSC_LOG)
828   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
829 #endif
830   PetscFree(a->a);
831   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
832   if (a->diag) PetscFree(a->diag);
833   if (a->ilen) PetscFree(a->ilen);
834   if (a->imax) PetscFree(a->imax);
835   if (a->solve_work) PetscFree(a->solve_work);
836   if (a->mult_work) PetscFree(a->mult_work);
837   PetscFree(a);
838   PLogObjectDestroy(A);
839   PetscHeaderDestroy(A);
840   return 0;
841 }
842 
843 #undef __FUNC__
844 #define __FUNC__ "MatSetOption_SeqBAIJ" /* ADIC Ignore */
845 int MatSetOption_SeqBAIJ(Mat A,MatOption op)
846 {
847   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
848   if      (op == MAT_ROW_ORIENTED)                 a->roworiented = 1;
849   else if (op == MAT_COLUMN_ORIENTED)              a->roworiented = 0;
850   else if (op == MAT_COLUMNS_SORTED)               a->sorted      = 1;
851   else if (op == MAT_COLUMNS_UNSORTED)             a->sorted      = 0;
852   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)     a->nonew       = 1;
853   else if (op == MAT_NEW_NONZERO_LOCATION_ERROR)   a->nonew       = -1;
854   else if (op == MAT_NEW_NONZERO_ALLOCATION_ERROR) a->nonew       = -2;
855   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS)    a->nonew       = 0;
856   else if (op == MAT_ROWS_SORTED ||
857            op == MAT_ROWS_UNSORTED ||
858            op == MAT_SYMMETRIC ||
859            op == MAT_STRUCTURALLY_SYMMETRIC ||
860            op == MAT_YES_NEW_DIAGONALS ||
861            op == MAT_IGNORE_OFF_PROC_ENTRIES)
862     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
863   else if (op == MAT_NO_NEW_DIAGONALS)
864     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
865   else
866     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
867   return 0;
868 }
869 
870 
871 /* -------------------------------------------------------*/
872 /* Should check that shapes of vectors and matrices match */
873 /* -------------------------------------------------------*/
874 #include "pinclude/plapack.h"
875 
876 #undef __FUNC__
877 #define __FUNC__ "MatMult_SeqBAIJ_1"
878 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
879 {
880   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
881   register Scalar *x,*z,*v,sum;
882   int             mbs=a->mbs,i,*idx,*ii,n;
883 
884   VecGetArray_Fast(xx,x);
885   VecGetArray_Fast(zz,z);
886 
887   idx   = a->j;
888   v     = a->a;
889   ii    = a->i;
890 
891   for ( i=0; i<mbs; i++ ) {
892     n    = ii[1] - ii[0]; ii++;
893     sum  = 0.0;
894     while (n--) sum += *v++ * x[*idx++];
895     z[i] = sum;
896   }
897   VecRestoreArray_Fast(xx,x);
898   VecRestoreArray_Fast(zz,z);
899   PLogFlops(2*a->nz - a->m);
900   return 0;
901 }
902 
903 #undef __FUNC__
904 #define __FUNC__ "MatMult_SeqBAIJ_2"
905 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
906 {
907   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
908   register Scalar *x,*z,*v,*xb,sum1,sum2;
909   register Scalar x1,x2;
910   int             mbs=a->mbs,i,*idx,*ii;
911   int             j,n;
912 
913   VecGetArray_Fast(xx,x);
914   VecGetArray_Fast(zz,z);
915 
916   idx   = a->j;
917   v     = a->a;
918   ii    = a->i;
919 
920   for ( i=0; i<mbs; i++ ) {
921     n  = ii[1] - ii[0]; ii++;
922     sum1 = 0.0; sum2 = 0.0;
923     for ( j=0; j<n; j++ ) {
924       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
925       sum1 += v[0]*x1 + v[2]*x2;
926       sum2 += v[1]*x1 + v[3]*x2;
927       v += 4;
928     }
929     z[0] = sum1; z[1] = sum2;
930     z += 2;
931   }
932   VecRestoreArray_Fast(xx,x);
933   VecRestoreArray_Fast(zz,z);
934   PLogFlops(4*a->nz - a->m);
935   return 0;
936 }
937 
938 #undef __FUNC__
939 #define __FUNC__ "MatMult_SeqBAIJ_3"
940 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
941 {
942   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
943   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
944   int             mbs=a->mbs,i,*idx,*ii,j,n;
945 
946   VecGetArray_Fast(xx,x);
947   VecGetArray_Fast(zz,z);
948 
949   idx   = a->j;
950   v     = a->a;
951   ii    = a->i;
952 
953   for ( i=0; i<mbs; i++ ) {
954     n  = ii[1] - ii[0]; ii++;
955     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
956     for ( j=0; j<n; j++ ) {
957       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
958       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
959       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
960       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
961       v += 9;
962     }
963     z[0] = sum1; z[1] = sum2; z[2] = sum3;
964     z += 3;
965   }
966   VecRestoreArray_Fast(xx,x);
967   VecRestoreArray_Fast(zz,z);
968   PLogFlops(18*a->nz - a->m);
969   return 0;
970 }
971 
972 #undef __FUNC__
973 #define __FUNC__ "MatMult_SeqBAIJ_4"
974 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
975 {
976   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
977   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
978   register Scalar x1,x2,x3,x4;
979   int             mbs=a->mbs,i,*idx,*ii;
980   int             j,n;
981 
982   VecGetArray_Fast(xx,x);
983   VecGetArray_Fast(zz,z);
984 
985   idx   = a->j;
986   v     = a->a;
987   ii    = a->i;
988 
989   for ( i=0; i<mbs; i++ ) {
990     n  = ii[1] - ii[0]; ii++;
991     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
992     for ( j=0; j<n; j++ ) {
993       xb = x + 4*(*idx++);
994       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
995       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
996       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
997       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
998       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
999       v += 16;
1000     }
1001     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1002     z += 4;
1003   }
1004   VecRestoreArray_Fast(xx,x);
1005   VecRestoreArray_Fast(zz,z);
1006   PLogFlops(32*a->nz - a->m);
1007   return 0;
1008 }
1009 
1010 #undef __FUNC__
1011 #define __FUNC__ "MatMult_SeqBAIJ_5"
1012 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
1013 {
1014   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1015   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1016   register Scalar x1,x2,x3,x4,x5;
1017   int             mbs=a->mbs,i,*idx,*ii,j,n;
1018 
1019   VecGetArray_Fast(xx,x);
1020   VecGetArray_Fast(zz,z);
1021 
1022   idx   = a->j;
1023   v     = a->a;
1024   ii    = a->i;
1025 
1026   for ( i=0; i<mbs; i++ ) {
1027     n  = ii[1] - ii[0]; ii++;
1028     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
1029     for ( j=0; j<n; j++ ) {
1030       xb = x + 5*(*idx++);
1031       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1032       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1033       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1034       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1035       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1036       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1037       v += 25;
1038     }
1039     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1040     z += 5;
1041   }
1042   VecRestoreArray_Fast(xx,x);
1043   VecRestoreArray_Fast(zz,z);
1044   PLogFlops(50*a->nz - a->m);
1045   return 0;
1046 }
1047 
1048 #undef __FUNC__
1049 #define __FUNC__ "MatMult_SeqBAIJ_7"
1050 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1051 {
1052   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1053   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1054   register Scalar x1,x2,x3,x4,x5,x6,x7;
1055   int             mbs=a->mbs,i,*idx,*ii,j,n;
1056 
1057   VecGetArray_Fast(xx,x);
1058   VecGetArray_Fast(zz,z);
1059 
1060   idx   = a->j;
1061   v     = a->a;
1062   ii    = a->i;
1063 
1064   for ( i=0; i<mbs; i++ ) {
1065     n  = ii[1] - ii[0]; ii++;
1066     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1067     for ( j=0; j<n; j++ ) {
1068       xb = x + 7*(*idx++);
1069       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1070       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1071       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1072       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1073       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1074       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1075       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1076       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1077       v += 49;
1078     }
1079     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1080     z += 7;
1081   }
1082 
1083   VecRestoreArray_Fast(xx,x);
1084   VecRestoreArray_Fast(zz,z);
1085   PLogFlops(98*a->nz - a->m);
1086   return 0;
1087 }
1088 
1089 #undef __FUNC__
1090 #define __FUNC__ "MatMult_SeqBAIJ_N"
1091 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1092 {
1093   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1094   register Scalar *x,*z,*v,*xb;
1095   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
1096   int             _One = 1,ncols,k;
1097   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
1098 
1099   VecGetArray_Fast(xx,x);
1100   VecGetArray_Fast(zz,z);
1101 
1102   idx   = a->j;
1103   v     = a->a;
1104   ii    = a->i;
1105 
1106 
1107   if (!a->mult_work) {
1108     k = PetscMax(a->m,a->n);
1109     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1110   }
1111   work = a->mult_work;
1112   for ( i=0; i<mbs; i++ ) {
1113     n     = ii[1] - ii[0]; ii++;
1114     ncols = n*bs;
1115     workt = work;
1116     for ( j=0; j<n; j++ ) {
1117       xb = x + bs*(*idx++);
1118       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1119       workt += bs;
1120     }
1121     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
1122     v += n*bs2;
1123     z += bs;
1124   }
1125   VecRestoreArray_Fast(xx,x);
1126   VecRestoreArray_Fast(zz,z);
1127   PLogFlops(2*a->nz*bs2 - a->m);
1128   return 0;
1129 }
1130 
1131 #undef __FUNC__
1132 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
1133 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1134 {
1135   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1136   register Scalar *x,*y,*z,*v,sum;
1137   int             mbs=a->mbs,i,*idx,*ii,n;
1138 
1139   VecGetArray_Fast(xx,x);
1140   VecGetArray_Fast(yy,y);
1141   VecGetArray_Fast(zz,z);
1142 
1143   idx   = a->j;
1144   v     = a->a;
1145   ii    = a->i;
1146 
1147   for ( i=0; i<mbs; i++ ) {
1148     n    = ii[1] - ii[0]; ii++;
1149     sum  = y[i];
1150     while (n--) sum += *v++ * x[*idx++];
1151     z[i] = sum;
1152   }
1153   VecRestoreArray_Fast(xx,x);
1154   VecRestoreArray_Fast(yy,y);
1155   VecRestoreArray_Fast(zz,z);
1156   PLogFlops(2*a->nz);
1157   return 0;
1158 }
1159 
1160 #undef __FUNC__
1161 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
1162 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1163 {
1164   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1165   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
1166   register Scalar x1,x2;
1167   int             mbs=a->mbs,i,*idx,*ii;
1168   int             j,n;
1169 
1170   VecGetArray_Fast(xx,x);
1171   VecGetArray_Fast(yy,y);
1172   VecGetArray_Fast(zz,z);
1173 
1174   idx   = a->j;
1175   v     = a->a;
1176   ii    = a->i;
1177 
1178   for ( i=0; i<mbs; i++ ) {
1179     n  = ii[1] - ii[0]; ii++;
1180     sum1 = y[0]; sum2 = y[1];
1181     for ( j=0; j<n; j++ ) {
1182       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1183       sum1 += v[0]*x1 + v[2]*x2;
1184       sum2 += v[1]*x1 + v[3]*x2;
1185       v += 4;
1186     }
1187     z[0] = sum1; z[1] = sum2;
1188     z += 2; y += 2;
1189   }
1190   VecRestoreArray_Fast(xx,x);
1191   VecRestoreArray_Fast(yy,y);
1192   VecRestoreArray_Fast(zz,z);
1193   PLogFlops(4*a->nz);
1194   return 0;
1195 }
1196 
1197 #undef __FUNC__
1198 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
1199 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1200 {
1201   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1202   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
1203   int             mbs=a->mbs,i,*idx,*ii,j,n;
1204 
1205   VecGetArray_Fast(xx,x);
1206   VecGetArray_Fast(yy,y);
1207   VecGetArray_Fast(zz,z);
1208 
1209   idx   = a->j;
1210   v     = a->a;
1211   ii    = a->i;
1212 
1213   for ( i=0; i<mbs; i++ ) {
1214     n  = ii[1] - ii[0]; ii++;
1215     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1216     for ( j=0; j<n; j++ ) {
1217       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1218       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1219       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1220       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1221       v += 9;
1222     }
1223     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1224     z += 3; y += 3;
1225   }
1226   VecRestoreArray_Fast(xx,x);
1227   VecRestoreArray_Fast(yy,y);
1228   VecRestoreArray_Fast(zz,z);
1229   PLogFlops(18*a->nz);
1230   return 0;
1231 }
1232 
1233 #undef __FUNC__
1234 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
1235 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1236 {
1237   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1238   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
1239   register Scalar x1,x2,x3,x4;
1240   int             mbs=a->mbs,i,*idx,*ii;
1241   int             j,n;
1242 
1243   VecGetArray_Fast(xx,x);
1244   VecGetArray_Fast(yy,y);
1245   VecGetArray_Fast(zz,z);
1246 
1247   idx   = a->j;
1248   v     = a->a;
1249   ii    = a->i;
1250 
1251   for ( i=0; i<mbs; i++ ) {
1252     n  = ii[1] - ii[0]; ii++;
1253     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1254     for ( j=0; j<n; j++ ) {
1255       xb = x + 4*(*idx++);
1256       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1257       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1258       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1259       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1260       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1261       v += 16;
1262     }
1263     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1264     z += 4; y += 4;
1265   }
1266   VecRestoreArray_Fast(xx,x);
1267   VecRestoreArray_Fast(yy,y);
1268   VecRestoreArray_Fast(zz,z);
1269   PLogFlops(32*a->nz);
1270   return 0;
1271 }
1272 
1273 #undef __FUNC__
1274 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
1275 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1276 {
1277   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1278   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1279   register Scalar x1,x2,x3,x4,x5;
1280   int             mbs=a->mbs,i,*idx,*ii,j,n;
1281 
1282   VecGetArray_Fast(xx,x);
1283   VecGetArray_Fast(yy,y);
1284   VecGetArray_Fast(zz,z);
1285 
1286   idx   = a->j;
1287   v     = a->a;
1288   ii    = a->i;
1289 
1290   for ( i=0; i<mbs; i++ ) {
1291     n  = ii[1] - ii[0]; ii++;
1292     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1293     for ( j=0; j<n; j++ ) {
1294       xb = x + 5*(*idx++);
1295       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1296       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1297       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1298       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1299       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1300       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1301       v += 25;
1302     }
1303     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1304     z += 5; y += 5;
1305   }
1306   VecRestoreArray_Fast(xx,x);
1307   VecRestoreArray_Fast(yy,y);
1308   VecRestoreArray_Fast(zz,z);
1309   PLogFlops(50*a->nz);
1310   return 0;
1311 }
1312 
1313 #undef __FUNC__
1314 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
1315 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1316 {
1317   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1318   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1319   register Scalar x1,x2,x3,x4,x5,x6,x7;
1320   int             mbs=a->mbs,i,*idx,*ii,j,n;
1321 
1322   VecGetArray_Fast(xx,x);
1323   VecGetArray_Fast(yy,y);
1324   VecGetArray_Fast(zz,z);
1325 
1326   idx   = a->j;
1327   v     = a->a;
1328   ii    = a->i;
1329 
1330   for ( i=0; i<mbs; i++ ) {
1331     n  = ii[1] - ii[0]; ii++;
1332     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1333     for ( j=0; j<n; j++ ) {
1334       xb = x + 7*(*idx++);
1335       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1336       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1337       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1338       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1339       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1340       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1341       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1342       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1343       v += 49;
1344     }
1345     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1346     z += 7; y += 7;
1347   }
1348   VecRestoreArray_Fast(xx,x);
1349   VecRestoreArray_Fast(yy,y);
1350   VecRestoreArray_Fast(zz,z);
1351   PLogFlops(98*a->nz);
1352   return 0;
1353 }
1354 
1355 
1356 #undef __FUNC__
1357 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
1358 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1359 {
1360   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1361   register Scalar *x,*z,*v,*xb;
1362   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
1363   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1364 
1365   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1366 
1367   VecGetArray_Fast(xx,x);
1368   VecGetArray_Fast(zz,z);
1369 
1370   idx   = a->j;
1371   v     = a->a;
1372   ii    = a->i;
1373 
1374 
1375   if (!a->mult_work) {
1376     k = PetscMax(a->m,a->n);
1377     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
1378   }
1379   work = a->mult_work;
1380   for ( i=0; i<mbs; i++ ) {
1381     n     = ii[1] - ii[0]; ii++;
1382     ncols = n*bs;
1383     workt = work;
1384     for ( j=0; j<n; j++ ) {
1385       xb = x + bs*(*idx++);
1386       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1387       workt += bs;
1388     }
1389     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
1390     v += n*bs2;
1391     z += bs;
1392   }
1393   VecRestoreArray_Fast(xx,x);
1394   VecRestoreArray_Fast(zz,z);
1395   PLogFlops(2*a->nz*bs2 );
1396   return 0;
1397 }
1398 
1399 #undef __FUNC__
1400 #define __FUNC__ "MatMultTrans_SeqBAIJ"
1401 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
1402 {
1403   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1404   Scalar          *xg,*zg,*zb;
1405   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1406   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1407   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1408 
1409 
1410   VecGetArray_Fast(xx,xg); x = xg;
1411   VecGetArray_Fast(zz,zg); z = zg;
1412   PetscMemzero(z,N*sizeof(Scalar));
1413 
1414   idx   = a->j;
1415   v     = a->a;
1416   ii    = a->i;
1417 
1418   switch (bs) {
1419   case 1:
1420     for ( i=0; i<mbs; i++ ) {
1421       n  = ii[1] - ii[0]; ii++;
1422       xb = x + i; x1 = xb[0];
1423       ib = idx + ai[i];
1424       for ( j=0; j<n; j++ ) {
1425         rval    = ib[j];
1426         z[rval] += *v++ * x1;
1427       }
1428     }
1429     break;
1430   case 2:
1431     for ( i=0; i<mbs; i++ ) {
1432       n  = ii[1] - ii[0]; ii++;
1433       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1434       ib = idx + ai[i];
1435       for ( j=0; j<n; j++ ) {
1436         rval      = ib[j]*2;
1437         z[rval++] += v[0]*x1 + v[1]*x2;
1438         z[rval++] += v[2]*x1 + v[3]*x2;
1439         v += 4;
1440       }
1441     }
1442     break;
1443   case 3:
1444     for ( i=0; i<mbs; i++ ) {
1445       n  = ii[1] - ii[0]; ii++;
1446       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1447       ib = idx + ai[i];
1448       for ( j=0; j<n; j++ ) {
1449         rval      = ib[j]*3;
1450         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1451         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1452         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1453         v += 9;
1454       }
1455     }
1456     break;
1457   case 4:
1458     for ( i=0; i<mbs; i++ ) {
1459       n  = ii[1] - ii[0]; ii++;
1460       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1461       ib = idx + ai[i];
1462       for ( j=0; j<n; j++ ) {
1463         rval      = ib[j]*4;
1464         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1465         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1466         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1467         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1468         v += 16;
1469       }
1470     }
1471     break;
1472   case 5:
1473     for ( i=0; i<mbs; i++ ) {
1474       n  = ii[1] - ii[0]; ii++;
1475       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1476       x4 = xb[3];   x5 = xb[4];
1477       ib = idx + ai[i];
1478       for ( j=0; j<n; j++ ) {
1479         rval      = ib[j]*5;
1480         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1481         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1482         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1483         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1484         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1485         v += 25;
1486       }
1487     }
1488     break;
1489       /* block sizes larger then 5 by 5 are handled by BLAS */
1490     default: {
1491       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1492       if (!a->mult_work) {
1493         k = PetscMax(a->m,a->n);
1494         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1495         CHKPTRQ(a->mult_work);
1496       }
1497       work = a->mult_work;
1498       for ( i=0; i<mbs; i++ ) {
1499         n     = ii[1] - ii[0]; ii++;
1500         ncols = n*bs;
1501         PetscMemzero(work,ncols*sizeof(Scalar));
1502         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1503         v += n*bs2;
1504         x += bs;
1505         workt = work;
1506         for ( j=0; j<n; j++ ) {
1507           zb = z + bs*(*idx++);
1508           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1509           workt += bs;
1510         }
1511       }
1512     }
1513   }
1514   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1515   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1516   PLogFlops(2*a->nz*a->bs2 - a->n);
1517   return 0;
1518 }
1519 
1520 #undef __FUNC__
1521 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1522 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1523 {
1524   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1525   Scalar          *xg,*zg,*zb;
1526   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1527   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1528   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1529 
1530 
1531 
1532   VecGetArray_Fast(xx,xg); x = xg;
1533   VecGetArray_Fast(zz,zg); z = zg;
1534 
1535   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1536   else PetscMemzero(z,N*sizeof(Scalar));
1537 
1538 
1539   idx   = a->j;
1540   v     = a->a;
1541   ii    = a->i;
1542 
1543   switch (bs) {
1544   case 1:
1545     for ( i=0; i<mbs; i++ ) {
1546       n  = ii[1] - ii[0]; ii++;
1547       xb = x + i; x1 = xb[0];
1548       ib = idx + ai[i];
1549       for ( j=0; j<n; j++ ) {
1550         rval    = ib[j];
1551         z[rval] += *v++ * x1;
1552       }
1553     }
1554     break;
1555   case 2:
1556     for ( i=0; i<mbs; i++ ) {
1557       n  = ii[1] - ii[0]; ii++;
1558       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1559       ib = idx + ai[i];
1560       for ( j=0; j<n; j++ ) {
1561         rval      = ib[j]*2;
1562         z[rval++] += v[0]*x1 + v[1]*x2;
1563         z[rval++] += v[2]*x1 + v[3]*x2;
1564         v += 4;
1565       }
1566     }
1567     break;
1568   case 3:
1569     for ( i=0; i<mbs; i++ ) {
1570       n  = ii[1] - ii[0]; ii++;
1571       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1572       ib = idx + ai[i];
1573       for ( j=0; j<n; j++ ) {
1574         rval      = ib[j]*3;
1575         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1576         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1577         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1578         v += 9;
1579       }
1580     }
1581     break;
1582   case 4:
1583     for ( i=0; i<mbs; i++ ) {
1584       n  = ii[1] - ii[0]; ii++;
1585       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1586       ib = idx + ai[i];
1587       for ( j=0; j<n; j++ ) {
1588         rval      = ib[j]*4;
1589         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1590         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1591         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1592         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1593         v += 16;
1594       }
1595     }
1596     break;
1597   case 5:
1598     for ( i=0; i<mbs; i++ ) {
1599       n  = ii[1] - ii[0]; ii++;
1600       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1601       x4 = xb[3];   x5 = xb[4];
1602       ib = idx + ai[i];
1603       for ( j=0; j<n; j++ ) {
1604         rval      = ib[j]*5;
1605         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1606         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1607         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1608         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1609         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1610         v += 25;
1611       }
1612     }
1613     break;
1614       /* block sizes larger then 5 by 5 are handled by BLAS */
1615     default: {
1616       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1617       if (!a->mult_work) {
1618         k = PetscMax(a->m,a->n);
1619         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1620         CHKPTRQ(a->mult_work);
1621       }
1622       work = a->mult_work;
1623       for ( i=0; i<mbs; i++ ) {
1624         n     = ii[1] - ii[0]; ii++;
1625         ncols = n*bs;
1626         PetscMemzero(work,ncols*sizeof(Scalar));
1627         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1628         v += n*bs2;
1629         x += bs;
1630         workt = work;
1631         for ( j=0; j<n; j++ ) {
1632           zb = z + bs*(*idx++);
1633           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1634           workt += bs;
1635         }
1636       }
1637     }
1638   }
1639   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1640   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1641   PLogFlops(2*a->nz*a->bs2);
1642   return 0;
1643 }
1644 
1645 #undef __FUNC__
1646 #define __FUNC__ "MatGetInfo_SeqBAIJ" /* ADIC Ignore */
1647 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1648 {
1649   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1650 
1651   info->rows_global    = (double)a->m;
1652   info->columns_global = (double)a->n;
1653   info->rows_local     = (double)a->m;
1654   info->columns_local  = (double)a->n;
1655   info->block_size     = a->bs2;
1656   info->nz_allocated   = a->maxnz;
1657   info->nz_used        = a->bs2*a->nz;
1658   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1659   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1660     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1661   info->assemblies   = A->num_ass;
1662   info->mallocs      = a->reallocs;
1663   info->memory       = A->mem;
1664   if (A->factor) {
1665     info->fill_ratio_given  = A->info.fill_ratio_given;
1666     info->fill_ratio_needed = A->info.fill_ratio_needed;
1667     info->factor_mallocs    = A->info.factor_mallocs;
1668   } else {
1669     info->fill_ratio_given  = 0;
1670     info->fill_ratio_needed = 0;
1671     info->factor_mallocs    = 0;
1672   }
1673   return 0;
1674 }
1675 
1676 #undef __FUNC__
1677 #define __FUNC__ "MatEqual_SeqBAIJ"
1678 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1679 {
1680   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1681 
1682   if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type");
1683 
1684   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1685   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1686       (a->nz != b->nz)) {
1687     *flg = PETSC_FALSE; return 0;
1688   }
1689 
1690   /* if the a->i are the same */
1691   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1692     *flg = PETSC_FALSE; return 0;
1693   }
1694 
1695   /* if a->j are the same */
1696   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1697     *flg = PETSC_FALSE; return 0;
1698   }
1699 
1700   /* if a->a are the same */
1701   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1702     *flg = PETSC_FALSE; return 0;
1703   }
1704   *flg = PETSC_TRUE;
1705   return 0;
1706 
1707 }
1708 
1709 #undef __FUNC__
1710 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1711 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1712 {
1713   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1714   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1715   Scalar      *x, zero = 0.0,*aa,*aa_j;
1716 
1717   if (A->factor) SETERRQ(1,0,"Not for factored matrix");
1718   bs   = a->bs;
1719   aa   = a->a;
1720   ai   = a->i;
1721   aj   = a->j;
1722   ambs = a->mbs;
1723   bs2  = a->bs2;
1724 
1725   VecSet(&zero,v);
1726   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1727   if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector");
1728   for ( i=0; i<ambs; i++ ) {
1729     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1730       if (aj[j] == i) {
1731         row  = i*bs;
1732         aa_j = aa+j*bs2;
1733         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1734         break;
1735       }
1736     }
1737   }
1738   return 0;
1739 }
1740 
1741 #undef __FUNC__
1742 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1743 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1744 {
1745   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1746   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1747   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1748 
1749   ai  = a->i;
1750   aj  = a->j;
1751   aa  = a->a;
1752   m   = a->m;
1753   n   = a->n;
1754   bs  = a->bs;
1755   mbs = a->mbs;
1756   bs2 = a->bs2;
1757   if (ll) {
1758     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1759     if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length");
1760     for ( i=0; i<mbs; i++ ) { /* for each block row */
1761       M  = ai[i+1] - ai[i];
1762       li = l + i*bs;
1763       v  = aa + bs2*ai[i];
1764       for ( j=0; j<M; j++ ) { /* for each block */
1765         for ( k=0; k<bs2; k++ ) {
1766           (*v++) *= li[k%bs];
1767         }
1768       }
1769     }
1770   }
1771 
1772   if (rr) {
1773     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1774     if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length");
1775     for ( i=0; i<mbs; i++ ) { /* for each block row */
1776       M  = ai[i+1] - ai[i];
1777       v  = aa + bs2*ai[i];
1778       for ( j=0; j<M; j++ ) { /* for each block */
1779         ri = r + bs*aj[ai[i]+j];
1780         for ( k=0; k<bs; k++ ) {
1781           x = ri[k];
1782           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1783         }
1784       }
1785     }
1786   }
1787   return 0;
1788 }
1789 
1790 
1791 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1792 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1793 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1794 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1795 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1796 
1797 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1798 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1799 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1800 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1801 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1802 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1803 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec);
1804 
1805 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1806 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1807 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1808 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1809 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1810 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1811 
1812 #undef __FUNC__
1813 #define __FUNC__ "MatNorm_SeqBAIJ"
1814 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1815 {
1816   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1817   Scalar      *v = a->a;
1818   double      sum = 0.0;
1819   int         i,nz=a->nz,bs2=a->bs2;
1820 
1821   if (type == NORM_FROBENIUS) {
1822     for (i=0; i< bs2*nz; i++ ) {
1823 #if defined(PETSC_COMPLEX)
1824       sum += real(conj(*v)*(*v)); v++;
1825 #else
1826       sum += (*v)*(*v); v++;
1827 #endif
1828     }
1829     *norm = sqrt(sum);
1830   }
1831   else {
1832     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1833   }
1834   return 0;
1835 }
1836 
1837 extern int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec);
1838 /*
1839      note: This can only work for identity for row and col. It would
1840    be good to check this and otherwise generate an error.
1841 */
1842 #undef __FUNC__
1843 #define __FUNC__ "MatILUFactor_SeqBAIJ"
1844 int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1845 {
1846   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1847   Mat         outA;
1848   int         ierr;
1849 
1850   if (fill != 0) SETERRQ(1,0,"Only fill=0 supported");
1851 
1852   outA          = inA;
1853   inA->factor   = FACTOR_LU;
1854   a->row        = row;
1855   a->col        = col;
1856 
1857   if (!a->solve_work) {
1858     a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1859     PLogObjectMemory(inA,(a->m+a->bs)*sizeof(Scalar));
1860   }
1861 
1862   if (!a->diag) {
1863     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1864   }
1865   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1866 
1867   /*
1868       Blocksize 4 has a special faster solver for ILU(0) factorization
1869     with natural ordering
1870   */
1871   if (a->bs == 4) {
1872     inA->ops.solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
1873   }
1874 
1875   return 0;
1876 }
1877 
1878 #undef __FUNC__
1879 #define __FUNC__ "MatScale_SeqBAIJ"
1880 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1881 {
1882   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1883   int         one = 1, totalnz = a->bs2*a->nz;
1884   BLscal_( &totalnz, alpha, a->a, &one );
1885   PLogFlops(totalnz);
1886   return 0;
1887 }
1888 
1889 #undef __FUNC__
1890 #define __FUNC__ "MatGetValues_SeqBAIJ"
1891 int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1892 {
1893   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1894   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1895   int        *ai = a->i, *ailen = a->ilen;
1896   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1897   Scalar     *ap, *aa = a->a, zero = 0.0;
1898 
1899   for ( k=0; k<m; k++ ) { /* loop over rows */
1900     row  = im[k]; brow = row/bs;
1901     if (row < 0) SETERRQ(1,0,"Negative row");
1902     if (row >= a->m) SETERRQ(1,0,"Row too large");
1903     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1904     nrow = ailen[brow];
1905     for ( l=0; l<n; l++ ) { /* loop over columns */
1906       if (in[l] < 0) SETERRQ(1,0,"Negative column");
1907       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
1908       col  = in[l] ;
1909       bcol = col/bs;
1910       cidx = col%bs;
1911       ridx = row%bs;
1912       high = nrow;
1913       low  = 0; /* assume unsorted */
1914       while (high-low > 5) {
1915         t = (low+high)/2;
1916         if (rp[t] > bcol) high = t;
1917         else             low  = t;
1918       }
1919       for ( i=low; i<high; i++ ) {
1920         if (rp[i] > bcol) break;
1921         if (rp[i] == bcol) {
1922           *v++ = ap[bs2*i+bs*cidx+ridx];
1923           goto finished;
1924         }
1925       }
1926       *v++ = zero;
1927       finished:;
1928     }
1929   }
1930   return 0;
1931 }
1932 
1933 #undef __FUNC__
1934 #define __FUNC__ "MatGetBlockSize_SeqBAIJ" /* ADIC Ignore */
1935 int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1936 {
1937   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1938   *bs = baij->bs;
1939   return 0;
1940 }
1941 
1942 /* idx should be of length atlease bs */
1943 #undef __FUNC__
1944 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block"
1945 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1946 {
1947   int i,row;
1948   row = idx[0];
1949   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1950 
1951   for ( i=1; i<bs; i++ ) {
1952     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1953   }
1954   *flg = PETSC_TRUE;
1955   return 0;
1956 }
1957 
1958 #undef __FUNC__
1959 #define __FUNC__ "MatZeroRows_SeqBAIJ"
1960 int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1961 {
1962   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1963   IS          is_local;
1964   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1965   PetscTruth  flg;
1966   Scalar      zero = 0.0,*aa;
1967 
1968   /* Make a copy of the IS and  sort it */
1969   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1970   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1971   ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1972   ierr = ISSort(is_local); CHKERRQ(ierr);
1973   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1974 
1975   i = 0;
1976   while (i < is_n) {
1977     if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range");
1978     flg = PETSC_FALSE;
1979     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1980     if (flg) { /* There exists a block of rows to be Zerowed */
1981       baij->ilen[rows[i]/bs] = 0;
1982       i += bs;
1983     } else { /* Zero out only the requested row */
1984       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1985       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1986       for ( j=0; j<count; j++ ) {
1987         aa[0] = zero;
1988         aa+=bs;
1989       }
1990       i++;
1991     }
1992   }
1993   if (diag) {
1994     for ( j=0; j<is_n; j++ ) {
1995       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1996     }
1997   }
1998   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
1999   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
2000   ierr = ISDestroy(is_local); CHKERRQ(ierr);
2001   ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2002 
2003   return 0;
2004 }
2005 
2006 #undef __FUNC__
2007 #define __FUNC__ "MatPrintHelp_SeqBAIJ" /* ADIC Ignore */
2008 int MatPrintHelp_SeqBAIJ(Mat A)
2009 {
2010   static int called = 0;
2011   MPI_Comm   comm = A->comm;
2012 
2013   if (called) return 0; else called = 1;
2014   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
2015   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
2016   return 0;
2017 }
2018 
2019 /* -------------------------------------------------------------------*/
2020 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
2021        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
2022        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
2023        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
2024        MatSolve_SeqBAIJ_N,0,
2025        0,0,
2026        MatLUFactor_SeqBAIJ,0,
2027        0,
2028        MatTranspose_SeqBAIJ,
2029        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
2030        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
2031        0,MatAssemblyEnd_SeqBAIJ,
2032        0,
2033        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
2034        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
2035        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
2036        MatILUFactorSymbolic_SeqBAIJ,0,
2037        0,0,
2038        MatConvertSameType_SeqBAIJ,0,0,
2039        MatILUFactor_SeqBAIJ,0,0,
2040        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
2041        MatGetValues_SeqBAIJ,0,
2042        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
2043        0,0,0,MatGetBlockSize_SeqBAIJ,
2044        MatGetRowIJ_SeqBAIJ,
2045        MatRestoreRowIJ_SeqBAIJ,
2046        0,0,0,0,0,0,
2047        MatSetValuesBlocked_SeqBAIJ};
2048 
2049 #undef __FUNC__
2050 #define __FUNC__ "MatCreateSeqBAIJ"
2051 /*@C
2052    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
2053    compressed row) format.  For good matrix assembly performance the
2054    user should preallocate the matrix storage by setting the parameter nz
2055    (or the array nzz).  By setting these parameters accurately, performance
2056    during matrix assembly can be increased by more than a factor of 50.
2057 
2058    Input Parameters:
2059 .  comm - MPI communicator, set to PETSC_COMM_SELF
2060 .  bs - size of block
2061 .  m - number of rows
2062 .  n - number of columns
2063 .  nz - number of block nonzeros per block row (same for all rows)
2064 .  nzz - array containing the number of block nonzeros in the various block rows
2065          (possibly different for each block row) or PETSC_NULL
2066 
2067    Output Parameter:
2068 .  A - the matrix
2069 
2070    Options Database Keys:
2071 $    -mat_no_unroll - uses code that does not unroll the loops in the
2072 $                     block calculations (much solver)
2073 $    -mat_block_size - size of the blocks to use
2074 
2075    Notes:
2076    The block AIJ format is fully compatible with standard Fortran 77
2077    storage.  That is, the stored row and column indices can begin at
2078    either one (as in Fortran) or zero.  See the users' manual for details.
2079 
2080    Specify the preallocated storage with either nz or nnz (not both).
2081    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
2082    allocation.  For additional details, see the users manual chapter on
2083    matrices.
2084 
2085 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2086 @*/
2087 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
2088 {
2089   Mat         B;
2090   Mat_SeqBAIJ *b;
2091   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size;
2092 
2093   MPI_Comm_size(comm,&size);
2094   if (size > 1) SETERRQ(1,0,"Comm must be of size 1");
2095 
2096   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr);
2097 
2098   if (mbs*bs!=m || nbs*bs!=n)
2099     SETERRQ(1,0,"Number rows, cols must be divisible by blocksize");
2100 
2101   *A = 0;
2102   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATSEQBAIJ,comm);
2103   PLogObjectCreate(B);
2104   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
2105   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
2106   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
2107   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
2108   if (!flg) {
2109     switch (bs) {
2110     case 1:
2111       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2112       B->ops.solve           = MatSolve_SeqBAIJ_1;
2113       B->ops.mult            = MatMult_SeqBAIJ_1;
2114       B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
2115       break;
2116     case 2:
2117       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
2118       B->ops.solve           = MatSolve_SeqBAIJ_2;
2119       B->ops.mult            = MatMult_SeqBAIJ_2;
2120       B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
2121       break;
2122     case 3:
2123       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
2124       B->ops.solve           = MatSolve_SeqBAIJ_3;
2125       B->ops.mult            = MatMult_SeqBAIJ_3;
2126       B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
2127       break;
2128     case 4:
2129       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
2130       B->ops.solve           = MatSolve_SeqBAIJ_4;
2131       B->ops.mult            = MatMult_SeqBAIJ_4;
2132       B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
2133       break;
2134     case 5:
2135       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
2136       B->ops.solve           = MatSolve_SeqBAIJ_5;
2137       B->ops.mult            = MatMult_SeqBAIJ_5;
2138       B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
2139       break;
2140     case 7:
2141       B->ops.mult            = MatMult_SeqBAIJ_7;
2142       B->ops.solve           = MatSolve_SeqBAIJ_7;
2143       B->ops.multadd         = MatMultAdd_SeqBAIJ_7;
2144       break;
2145     }
2146   }
2147   B->destroy          = MatDestroy_SeqBAIJ;
2148   B->view             = MatView_SeqBAIJ;
2149   B->factor           = 0;
2150   B->lupivotthreshold = 1.0;
2151   B->mapping          = 0;
2152   b->row              = 0;
2153   b->col              = 0;
2154   b->reallocs         = 0;
2155 
2156   b->m       = m; B->m = m; B->M = m;
2157   b->n       = n; B->n = n; B->N = n;
2158   b->mbs     = mbs;
2159   b->nbs     = nbs;
2160   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
2161   if (nnz == PETSC_NULL) {
2162     if (nz == PETSC_DEFAULT) nz = 5;
2163     else if (nz <= 0)        nz = 1;
2164     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
2165     nz = nz*mbs;
2166   }
2167   else {
2168     nz = 0;
2169     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
2170   }
2171 
2172   /* allocate the matrix space */
2173   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
2174   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
2175   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
2176   b->j  = (int *) (b->a + nz*bs2);
2177   PetscMemzero(b->j,nz*sizeof(int));
2178   b->i  = b->j + nz;
2179   b->singlemalloc = 1;
2180 
2181   b->i[0] = 0;
2182   for (i=1; i<mbs+1; i++) {
2183     b->i[i] = b->i[i-1] + b->imax[i-1];
2184   }
2185 
2186   /* b->ilen will count nonzeros in each block row so far. */
2187   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
2188   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2189   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
2190 
2191   b->bs               = bs;
2192   b->bs2              = bs2;
2193   b->mbs              = mbs;
2194   b->nz               = 0;
2195   b->maxnz            = nz*bs2;
2196   b->sorted           = 0;
2197   b->roworiented      = 1;
2198   b->nonew            = 0;
2199   b->diag             = 0;
2200   b->solve_work       = 0;
2201   b->mult_work        = 0;
2202   b->spptr            = 0;
2203   B->info.nz_unneeded = (double)b->maxnz;
2204 
2205   *A = B;
2206   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
2207   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
2208   return 0;
2209 }
2210 
2211 #undef __FUNC__
2212 #define __FUNC__ "MatConvertSameType_SeqBAIJ"
2213 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
2214 {
2215   Mat         C;
2216   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
2217   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
2218 
2219   if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix");
2220 
2221   *B = 0;
2222   PetscHeaderCreate(C,_p_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm);
2223   PLogObjectCreate(C);
2224   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
2225   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
2226   C->destroy    = MatDestroy_SeqBAIJ;
2227   C->view       = MatView_SeqBAIJ;
2228   C->factor     = A->factor;
2229   c->row        = 0;
2230   c->col        = 0;
2231   C->assembled  = PETSC_TRUE;
2232 
2233   c->m = C->m   = a->m;
2234   c->n = C->n   = a->n;
2235   C->M          = a->m;
2236   C->N          = a->n;
2237 
2238   c->bs         = a->bs;
2239   c->bs2        = a->bs2;
2240   c->mbs        = a->mbs;
2241   c->nbs        = a->nbs;
2242 
2243   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
2244   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
2245   for ( i=0; i<mbs; i++ ) {
2246     c->imax[i] = a->imax[i];
2247     c->ilen[i] = a->ilen[i];
2248   }
2249 
2250   /* allocate the matrix space */
2251   c->singlemalloc = 1;
2252   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
2253   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
2254   c->j  = (int *) (c->a + nz*bs2);
2255   c->i  = c->j + nz;
2256   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
2257   if (mbs > 0) {
2258     PetscMemcpy(c->j,a->j,nz*sizeof(int));
2259     if (cpvalues == COPY_VALUES) {
2260       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
2261     }
2262   }
2263 
2264   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2265   c->sorted      = a->sorted;
2266   c->roworiented = a->roworiented;
2267   c->nonew       = a->nonew;
2268 
2269   if (a->diag) {
2270     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
2271     PLogObjectMemory(C,(mbs+1)*sizeof(int));
2272     for ( i=0; i<mbs; i++ ) {
2273       c->diag[i] = a->diag[i];
2274     }
2275   }
2276   else c->diag          = 0;
2277   c->nz                 = a->nz;
2278   c->maxnz              = a->maxnz;
2279   c->solve_work         = 0;
2280   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
2281   c->mult_work          = 0;
2282   *B = C;
2283   return 0;
2284 }
2285 
2286 #undef __FUNC__
2287 #define __FUNC__ "MatLoad_SeqBAIJ"
2288 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
2289 {
2290   Mat_SeqBAIJ  *a;
2291   Mat          B;
2292   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
2293   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
2294   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
2295   int          *masked, nmask,tmp,bs2,ishift;
2296   Scalar       *aa;
2297   MPI_Comm     comm = ((PetscObject) viewer)->comm;
2298 
2299   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2300   bs2  = bs*bs;
2301 
2302   MPI_Comm_size(comm,&size);
2303   if (size > 1) SETERRQ(1,0,"view must have one processor");
2304   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2305   ierr = PetscBinaryRead(fd,header,4,BINARY_INT); CHKERRQ(ierr);
2306   if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object");
2307   M = header[1]; N = header[2]; nz = header[3];
2308 
2309   if (M != N) SETERRQ(1,0,"Can only do square matrices");
2310 
2311   /*
2312      This code adds extra rows to make sure the number of rows is
2313     divisible by the blocksize
2314   */
2315   mbs        = M/bs;
2316   extra_rows = bs - M + bs*(mbs);
2317   if (extra_rows == bs) extra_rows = 0;
2318   else                  mbs++;
2319   if (extra_rows) {
2320     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n");
2321   }
2322 
2323   /* read in row lengths */
2324   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
2325   ierr = PetscBinaryRead(fd,rowlengths,M,BINARY_INT); CHKERRQ(ierr);
2326   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2327 
2328   /* read in column indices */
2329   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
2330   ierr = PetscBinaryRead(fd,jj,nz,BINARY_INT); CHKERRQ(ierr);
2331   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
2332 
2333   /* loop over row lengths determining block row lengths */
2334   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
2335   PetscMemzero(browlengths,mbs*sizeof(int));
2336   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
2337   PetscMemzero(mask,mbs*sizeof(int));
2338   masked = mask + mbs;
2339   rowcount = 0; nzcount = 0;
2340   for ( i=0; i<mbs; i++ ) {
2341     nmask = 0;
2342     for ( j=0; j<bs; j++ ) {
2343       kmax = rowlengths[rowcount];
2344       for ( k=0; k<kmax; k++ ) {
2345         tmp = jj[nzcount++]/bs;
2346         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
2347       }
2348       rowcount++;
2349     }
2350     browlengths[i] += nmask;
2351     /* zero out the mask elements we set */
2352     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2353   }
2354 
2355   /* create our matrix */
2356   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);CHKERRQ(ierr);
2357   B = *A;
2358   a = (Mat_SeqBAIJ *) B->data;
2359 
2360   /* set matrix "i" values */
2361   a->i[0] = 0;
2362   for ( i=1; i<= mbs; i++ ) {
2363     a->i[i]      = a->i[i-1] + browlengths[i-1];
2364     a->ilen[i-1] = browlengths[i-1];
2365   }
2366   a->nz         = 0;
2367   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
2368 
2369   /* read in nonzero values */
2370   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
2371   ierr = PetscBinaryRead(fd,aa,nz,BINARY_SCALAR); CHKERRQ(ierr);
2372   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
2373 
2374   /* set "a" and "j" values into matrix */
2375   nzcount = 0; jcount = 0;
2376   for ( i=0; i<mbs; i++ ) {
2377     nzcountb = nzcount;
2378     nmask    = 0;
2379     for ( j=0; j<bs; j++ ) {
2380       kmax = rowlengths[i*bs+j];
2381       for ( k=0; k<kmax; k++ ) {
2382         tmp = jj[nzcount++]/bs;
2383 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
2384       }
2385       rowcount++;
2386     }
2387     /* sort the masked values */
2388     PetscSortInt(nmask,masked);
2389 
2390     /* set "j" values into matrix */
2391     maskcount = 1;
2392     for ( j=0; j<nmask; j++ ) {
2393       a->j[jcount++]  = masked[j];
2394       mask[masked[j]] = maskcount++;
2395     }
2396     /* set "a" values into matrix */
2397     ishift = bs2*a->i[i];
2398     for ( j=0; j<bs; j++ ) {
2399       kmax = rowlengths[i*bs+j];
2400       for ( k=0; k<kmax; k++ ) {
2401         tmp    = jj[nzcountb]/bs ;
2402         block  = mask[tmp] - 1;
2403         point  = jj[nzcountb] - bs*tmp;
2404         idx    = ishift + bs2*block + j + bs*point;
2405         a->a[idx] = aa[nzcountb++];
2406       }
2407     }
2408     /* zero out the mask elements we set */
2409     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2410   }
2411   if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix");
2412 
2413   PetscFree(rowlengths);
2414   PetscFree(browlengths);
2415   PetscFree(aa);
2416   PetscFree(jj);
2417   PetscFree(mask);
2418 
2419   B->assembled = PETSC_TRUE;
2420 
2421   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info",&flg); CHKERRQ(ierr);
2422   if (flg) {
2423     Viewer tviewer;
2424     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2425     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO,0);CHKERRQ(ierr);
2426     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2427     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2428   }
2429   ierr = OptionsHasName(PETSC_NULL,"-mat_view_info_detailed",&flg);CHKERRQ(ierr);
2430   if (flg) {
2431     Viewer tviewer;
2432     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2433     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_INFO_LONG,0);CHKERRQ(ierr);
2434     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2435     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2436   }
2437   ierr = OptionsHasName(PETSC_NULL,"-mat_view",&flg); CHKERRQ(ierr);
2438   if (flg) {
2439     Viewer tviewer;
2440     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2441     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2442     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2443   }
2444   ierr = OptionsHasName(PETSC_NULL,"-mat_view_matlab",&flg); CHKERRQ(ierr);
2445   if (flg) {
2446     Viewer tviewer;
2447     ierr = ViewerFileOpenASCII(B->comm,"stdout",&tviewer);CHKERRQ(ierr);
2448     ierr = ViewerSetFormat(tviewer,VIEWER_FORMAT_ASCII_MATLAB,"M");CHKERRQ(ierr);
2449     ierr = MatView(B,tviewer); CHKERRQ(ierr);
2450     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2451   }
2452   ierr = OptionsHasName(PETSC_NULL,"-mat_view_draw",&flg); CHKERRQ(ierr);
2453   if (flg) {
2454     Viewer tviewer;
2455     ierr = ViewerDrawOpenX(B->comm,0,0,0,0,300,300,&tviewer); CHKERRQ(ierr);
2456     ierr = MatView(B,(Viewer)tviewer); CHKERRQ(ierr);
2457     ierr = ViewerFlush(tviewer); CHKERRQ(ierr);
2458     ierr = ViewerDestroy(tviewer); CHKERRQ(ierr);
2459   }
2460   return 0;
2461 }
2462 
2463 
2464 
2465