xref: /petsc/src/mat/impls/baij/seq/baij.c (revision c2ea1186303e6ad186d8293edcc4dffdc3bb3ffe)
1 
2 #ifdef PETSC_RCS_HEADER
3 static char vcid[] = "$Id: baij.c,v 1.112 1997/09/26 02:19:29 bsmith Exp bsmith $";
4 #endif
5 
6 /*
7     Defines the basic matrix operations for the BAIJ (compressed row)
8   matrix storage format.
9 */
10 
11 #include "pinclude/pviewer.h"
12 #include "sys.h"
13 #include "src/mat/impls/baij/seq/baij.h"
14 #include "src/vec/vecimpl.h"
15 #include "src/inline/spops.h"
16 #include "petsc.h"
17 
18 
19 /*
20      Adds diagonal pointers to sparse matrix structure.
21 */
22 
23 #undef __FUNC__
24 #define __FUNC__ "MatMarkDiag_SeqBAIJ"
25 int MatMarkDiag_SeqBAIJ(Mat A)
26 {
27   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
28   int         i,j, *diag, m = a->mbs;
29 
30   diag = (int *) PetscMalloc( (m+1)*sizeof(int)); CHKPTRQ(diag);
31   PLogObjectMemory(A,(m+1)*sizeof(int));
32   for ( i=0; i<m; i++ ) {
33     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
34       if (a->j[j] == i) {
35         diag[i] = j;
36         break;
37       }
38     }
39   }
40   a->diag = diag;
41   return 0;
42 }
43 
44 
45 extern int MatToSymmetricIJ_SeqAIJ(int,int*,int*,int,int,int**,int**);
46 
47 #undef __FUNC__
48 #define __FUNC__ "MatGetRowIJ_SeqBAIJ"
49 static int MatGetRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
50                             PetscTruth *done)
51 {
52   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
53   int         ierr, n = a->mbs,i;
54 
55   *nn = n;
56   if (!ia) return 0;
57   if (symmetric) {
58     ierr = MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,0,oshift,ia,ja);CHKERRQ(ierr);
59   } else if (oshift == 1) {
60     /* temporarily add 1 to i and j indices */
61     int nz = a->i[n] + 1;
62     for ( i=0; i<nz; i++ ) a->j[i]++;
63     for ( i=0; i<n+1; i++ ) a->i[i]++;
64     *ia = a->i; *ja = a->j;
65   } else {
66     *ia = a->i; *ja = a->j;
67   }
68 
69   return 0;
70 }
71 
72 #undef __FUNC__
73 #define __FUNC__ "MatRestoreRowIJ_SeqBAIJ"
74 static int MatRestoreRowIJ_SeqBAIJ(Mat A,int oshift,PetscTruth symmetric,int *nn,int **ia,int **ja,
75                                 PetscTruth *done)
76 {
77   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
78   int         i,n = a->mbs;
79 
80   if (!ia) return 0;
81   if (symmetric) {
82     PetscFree(*ia);
83     PetscFree(*ja);
84   } else if (oshift == 1) {
85     int nz = a->i[n];
86     for ( i=0; i<nz; i++ ) a->j[i]--;
87     for ( i=0; i<n+1; i++ ) a->i[i]--;
88   }
89   return 0;
90 }
91 
92 
93 #undef __FUNC__
94 #define __FUNC__ "MatView_SeqBAIJ_Binary"
95 static int MatView_SeqBAIJ_Binary(Mat A,Viewer viewer)
96 {
97   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
98   int         i, fd, *col_lens, ierr, bs = a->bs,count,*jj,j,k,l,bs2=a->bs2;
99   Scalar      *aa;
100 
101   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
102   col_lens = (int *) PetscMalloc((4+a->m)*sizeof(int));CHKPTRQ(col_lens);
103   col_lens[0] = MAT_COOKIE;
104 
105   col_lens[1] = a->m;
106   col_lens[2] = a->n;
107   col_lens[3] = a->nz*bs2;
108 
109   /* store lengths of each row and write (including header) to file */
110   count = 0;
111   for ( i=0; i<a->mbs; i++ ) {
112     for ( j=0; j<bs; j++ ) {
113       col_lens[4+count++] = bs*(a->i[i+1] - a->i[i]);
114     }
115   }
116   ierr = PetscBinaryWrite(fd,col_lens,4+a->m,PETSC_INT,1); CHKERRQ(ierr);
117   PetscFree(col_lens);
118 
119   /* store column indices (zero start index) */
120   jj = (int *) PetscMalloc( (a->nz+1)*bs2*sizeof(int) ); CHKPTRQ(jj);
121   count = 0;
122   for ( i=0; i<a->mbs; i++ ) {
123     for ( j=0; j<bs; j++ ) {
124       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
125         for ( l=0; l<bs; l++ ) {
126           jj[count++] = bs*a->j[k] + l;
127         }
128       }
129     }
130   }
131   ierr = PetscBinaryWrite(fd,jj,bs2*a->nz,PETSC_INT,0); CHKERRQ(ierr);
132   PetscFree(jj);
133 
134   /* store nonzero values */
135   aa = (Scalar *) PetscMalloc((a->nz+1)*bs2*sizeof(Scalar)); CHKPTRQ(aa);
136   count = 0;
137   for ( i=0; i<a->mbs; i++ ) {
138     for ( j=0; j<bs; j++ ) {
139       for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
140         for ( l=0; l<bs; l++ ) {
141           aa[count++] = a->a[bs2*k + l*bs + j];
142         }
143       }
144     }
145   }
146   ierr = PetscBinaryWrite(fd,aa,bs2*a->nz,PETSC_SCALAR,0); CHKERRQ(ierr);
147   PetscFree(aa);
148   return 0;
149 }
150 
151 #undef __FUNC__
152 #define __FUNC__ "MatView_SeqBAIJ_ASCII"
153 static int MatView_SeqBAIJ_ASCII(Mat A,Viewer viewer)
154 {
155   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
156   int         ierr, i,j,format,bs = a->bs,k,l,bs2=a->bs2;
157   FILE        *fd;
158   char        *outputname;
159 
160   ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
161   ierr = ViewerFileGetOutputname_Private(viewer,&outputname);CHKERRQ(ierr);
162   ierr = ViewerGetFormat(viewer,&format);
163   if (format == VIEWER_FORMAT_ASCII_INFO || format == VIEWER_FORMAT_ASCII_INFO_LONG) {
164     fprintf(fd,"  block size is %d\n",bs);
165   }
166   else if (format == VIEWER_FORMAT_ASCII_MATLAB) {
167     SETERRQ(1,0,"Matlab format not supported");
168   }
169   else if (format == VIEWER_FORMAT_ASCII_COMMON) {
170     for ( i=0; i<a->mbs; i++ ) {
171       for ( j=0; j<bs; j++ ) {
172         fprintf(fd,"row %d:",i*bs+j);
173         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
174           for ( l=0; l<bs; l++ ) {
175 #if defined(PETSC_COMPLEX)
176           if (imag(a->a[bs2*k + l*bs + j]) > 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
177             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
178               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
179           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0 && real(a->a[bs2*k + l*bs + j]) != 0.0)
180             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
181               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
182           else if (real(a->a[bs2*k + l*bs + j]) != 0.0)
183             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
184 #else
185           if (a->a[bs2*k + l*bs + j] != 0.0)
186             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
187 #endif
188           }
189         }
190         fprintf(fd,"\n");
191       }
192     }
193   }
194   else {
195     for ( i=0; i<a->mbs; i++ ) {
196       for ( j=0; j<bs; j++ ) {
197         fprintf(fd,"row %d:",i*bs+j);
198         for ( k=a->i[i]; k<a->i[i+1]; k++ ) {
199           for ( l=0; l<bs; l++ ) {
200 #if defined(PETSC_COMPLEX)
201           if (imag(a->a[bs2*k + l*bs + j]) > 0.0) {
202             fprintf(fd," %d %g + %g i",bs*a->j[k]+l,
203               real(a->a[bs2*k + l*bs + j]),imag(a->a[bs2*k + l*bs + j]));
204           }
205           else if (imag(a->a[bs2*k + l*bs + j]) < 0.0) {
206             fprintf(fd," %d %g - %g i",bs*a->j[k]+l,
207               real(a->a[bs2*k + l*bs + j]),-imag(a->a[bs2*k + l*bs + j]));
208           }
209           else {
210             fprintf(fd," %d %g ",bs*a->j[k]+l,real(a->a[bs2*k + l*bs + j]));
211           }
212 #else
213             fprintf(fd," %d %g ",bs*a->j[k]+l,a->a[bs2*k + l*bs + j]);
214 #endif
215           }
216         }
217         fprintf(fd,"\n");
218       }
219     }
220   }
221   fflush(fd);
222   return 0;
223 }
224 
225 #undef __FUNC__
226 #define __FUNC__ "MatView_SeqBAIJ_Draw"
227 static int MatView_SeqBAIJ_Draw(Mat A,Viewer viewer)
228 {
229   Mat_SeqBAIJ  *a=(Mat_SeqBAIJ *) A->data;
230   int          row,ierr,i,j,k,l,mbs=a->mbs,pause,color,bs=a->bs,bs2=a->bs2;
231   double       xl,yl,xr,yr,w,h,xc,yc,scale = 1.0,x_l,x_r,y_l,y_r;
232   Scalar       *aa;
233   Draw         draw;
234   DrawButton   button;
235   PetscTruth   isnull;
236 
237   ViewerDrawGetDraw(viewer,&draw);
238   ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) return 0;
239 
240   xr  = a->n; yr = a->m; h = yr/10.0; w = xr/10.0;
241   xr += w;    yr += h;  xl = -w;     yl = -h;
242   ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
243   /* loop over matrix elements drawing boxes */
244   color = DRAW_BLUE;
245   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
246     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
247       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
248       x_l = a->j[j]*bs; x_r = x_l + 1.0;
249       aa = a->a + j*bs2;
250       for ( k=0; k<bs; k++ ) {
251         for ( l=0; l<bs; l++ ) {
252           if (PetscReal(*aa++) >=  0.) continue;
253           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
254         }
255       }
256     }
257   }
258   color = DRAW_CYAN;
259   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
260     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
261       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
262       x_l = a->j[j]*bs; x_r = x_l + 1.0;
263       aa = a->a + j*bs2;
264       for ( k=0; k<bs; k++ ) {
265         for ( l=0; l<bs; l++ ) {
266           if (PetscReal(*aa++) != 0.) continue;
267           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
268         }
269       }
270     }
271   }
272 
273   color = DRAW_RED;
274   for ( i=0,row=0; i<mbs; i++,row+=bs ) {
275     for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
276       y_l = a->m - row - 1.0; y_r = y_l + 1.0;
277       x_l = a->j[j]*bs; x_r = x_l + 1.0;
278       aa = a->a + j*bs2;
279       for ( k=0; k<bs; k++ ) {
280         for ( l=0; l<bs; l++ ) {
281           if (PetscReal(*aa++) <= 0.) continue;
282           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
283         }
284       }
285     }
286   }
287 
288   DrawFlush(draw);
289   DrawGetPause(draw,&pause);
290   if (pause >= 0) { PetscSleep(pause); return 0;}
291 
292   /* allow the matrix to zoom or shrink */
293   ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
294   while (button != BUTTON_RIGHT) {
295     DrawClear(draw);
296     if (button == BUTTON_LEFT) scale = .5;
297     else if (button == BUTTON_CENTER) scale = 2.;
298     xl = scale*(xl + w - xc) + xc - w*scale;
299     xr = scale*(xr - w - xc) + xc + w*scale;
300     yl = scale*(yl + h - yc) + yc - h*scale;
301     yr = scale*(yr - h - yc) + yc + h*scale;
302     w *= scale; h *= scale;
303     ierr = DrawSetCoordinates(draw,xl,yl,xr,yr); CHKERRQ(ierr);
304     color = DRAW_BLUE;
305     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
306       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
307         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
308         x_l = a->j[j]*bs; x_r = x_l + 1.0;
309         aa = a->a + j*bs2;
310         for ( k=0; k<bs; k++ ) {
311           for ( l=0; l<bs; l++ ) {
312             if (PetscReal(*aa++) >=  0.) continue;
313             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
314           }
315         }
316       }
317     }
318     color = DRAW_CYAN;
319     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
320       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
321         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
322         x_l = a->j[j]*bs; x_r = x_l + 1.0;
323         aa = a->a + j*bs2;
324         for ( k=0; k<bs; k++ ) {
325           for ( l=0; l<bs; l++ ) {
326           if (PetscReal(*aa++) != 0.) continue;
327           DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
328           }
329         }
330       }
331     }
332 
333     color = DRAW_RED;
334     for ( i=0,row=0; i<mbs; i++,row+=bs ) {
335       for ( j=a->i[i]; j<a->i[i+1]; j++ ) {
336         y_l = a->m - row - 1.0; y_r = y_l + 1.0;
337         x_l = a->j[j]*bs; x_r = x_l + 1.0;
338         aa = a->a + j*bs2;
339         for ( k=0; k<bs; k++ ) {
340           for ( l=0; l<bs; l++ ) {
341             if (PetscReal(*aa++) <= 0.) continue;
342             DrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
343           }
344         }
345       }
346     }
347     ierr = DrawGetMouseButton(draw,&button,&xc,&yc,0,0);
348   }
349   return 0;
350 }
351 
352 #undef __FUNC__
353 #define __FUNC__ "MatView_SeqBAIJ"
354 int MatView_SeqBAIJ(PetscObject obj,Viewer viewer)
355 {
356   Mat         A = (Mat) obj;
357   ViewerType  vtype;
358   int         ierr;
359 
360   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
361   if (vtype == MATLAB_VIEWER) {
362     SETERRQ(1,0,"Matlab viewer not supported");
363   }
364   else if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER){
365     return MatView_SeqBAIJ_ASCII(A,viewer);
366   }
367   else if (vtype == BINARY_FILE_VIEWER) {
368     return MatView_SeqBAIJ_Binary(A,viewer);
369   }
370   else if (vtype == DRAW_VIEWER) {
371     return MatView_SeqBAIJ_Draw(A,viewer);
372   }
373   return 0;
374 }
375 
376 #define CHUNKSIZE  10
377 
378 #undef __FUNC__
379 #define __FUNC__ "MatSetValues_SeqBAIJ"
380 int MatSetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
381 {
382   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
383   int         *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
384   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen,roworiented=a->roworiented;
385   int         *aj=a->j,nonew=a->nonew,bs=a->bs,brow,bcol;
386   int          ridx,cidx,bs2=a->bs2;
387   Scalar      *ap,value,*aa=a->a,*bap;
388 
389   for ( k=0; k<m; k++ ) { /* loop over added rows */
390     row  = im[k]; brow = row/bs;
391 #if defined(PETSC_BOPT_g)
392     if (row < 0) SETERRQ(1,0,"Negative row");
393     if (row >= a->m) SETERRQ(1,0,"Row too large");
394 #endif
395     rp   = aj + ai[brow];
396     ap   = aa + bs2*ai[brow];
397     rmax = imax[brow];
398     nrow = ailen[brow];
399     low  = 0;
400     for ( l=0; l<n; l++ ) { /* loop over added columns */
401 #if defined(PETSC_BOPT_g)
402       if (in[l] < 0) SETERRQ(1,0,"Negative column");
403       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
404 #endif
405       col = in[l]; bcol = col/bs;
406       ridx = row % bs; cidx = col % bs;
407       if (roworiented) {
408         value = *v++;
409       }
410       else {
411         value = v[k + l*m];
412       }
413       if (!sorted) low = 0; high = nrow;
414       while (high-low > 7) {
415         t = (low+high)/2;
416         if (rp[t] > bcol) high = t;
417         else              low  = t;
418       }
419       for ( i=low; i<high; i++ ) {
420         if (rp[i] > bcol) break;
421         if (rp[i] == bcol) {
422           bap  = ap +  bs2*i + bs*cidx + ridx;
423           if (is == ADD_VALUES) *bap += value;
424           else                  *bap  = value;
425           goto noinsert1;
426         }
427       }
428       if (nonew == 1) goto noinsert1;
429       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
430       if (nrow >= rmax) {
431         /* there is no extra room in row, therefore enlarge */
432         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
433         Scalar *new_a;
434 
435         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
436 
437         /* Malloc new storage space */
438         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
439         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
440         new_j   = (int *) (new_a + bs2*new_nz);
441         new_i   = new_j + new_nz;
442 
443         /* copy over old data into new slots */
444         for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];}
445         for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
446         PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));
447         len = (new_nz - CHUNKSIZE - ai[brow] - nrow);
448         PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,
449                                                            len*sizeof(int));
450         PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar));
451         PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
452         PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE),
453                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));
454         /* free up old matrix storage */
455         PetscFree(a->a);
456         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
457         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
458         a->singlemalloc = 1;
459 
460         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
461         rmax = imax[brow] = imax[brow] + CHUNKSIZE;
462         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
463         a->maxnz += bs2*CHUNKSIZE;
464         a->reallocs++;
465         a->nz++;
466       }
467       N = nrow++ - 1;
468       /* shift up all the later entries in this row */
469       for ( ii=N; ii>=i; ii-- ) {
470         rp[ii+1] = rp[ii];
471         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
472       }
473       if (N>=i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
474       rp[i]                      = bcol;
475       ap[bs2*i + bs*cidx + ridx] = value;
476       noinsert1:;
477       low = i;
478     }
479     ailen[brow] = nrow;
480   }
481   return 0;
482 }
483 
484 #undef __FUNC__
485 #define __FUNC__ "MatSetValuesBlocked_SeqBAIJ"
486 int MatSetValuesBlocked_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v,InsertMode is)
487 {
488   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
489   int         *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,sorted=a->sorted;
490   int         *imax=a->imax,*ai=a->i,*ailen=a->ilen, roworiented=a->roworiented;
491   int         *aj=a->j,nonew=a->nonew,bs2=a->bs2,bs=a->bs,stepval;
492   Scalar      *ap,*value=v,*aa=a->a,*bap;
493 
494   if (roworiented) {
495     stepval = (n-1)*bs;
496   } else {
497     stepval = (m-1)*bs;
498   }
499   for ( k=0; k<m; k++ ) { /* loop over added rows */
500     row  = im[k];
501 #if defined(PETSC_BOPT_g)
502     if (row < 0) SETERRQ(1,0,"Negative row");
503     if (row >= a->mbs) SETERRQ(1,0,"Row too large");
504 #endif
505     rp   = aj + ai[row];
506     ap   = aa + bs2*ai[row];
507     rmax = imax[row];
508     nrow = ailen[row];
509     low  = 0;
510     for ( l=0; l<n; l++ ) { /* loop over added columns */
511 #if defined(PETSC_BOPT_g)
512       if (in[l] < 0) SETERRQ(1,0,"Negative column");
513       if (in[l] >= a->nbs) SETERRQ(1,0,"Column too large");
514 #endif
515       col = in[l];
516       if (roworiented) {
517         value = v + k*(stepval+bs)*bs + l*bs;
518       } else {
519         value = v + l*(stepval+bs)*bs + k*bs;
520       }
521       if (!sorted) low = 0; high = nrow;
522       while (high-low > 7) {
523         t = (low+high)/2;
524         if (rp[t] > col) high = t;
525         else             low  = t;
526       }
527       for ( i=low; i<high; i++ ) {
528         if (rp[i] > col) break;
529         if (rp[i] == col) {
530           bap  = ap +  bs2*i;
531           if (roworiented) {
532             if (is == ADD_VALUES) {
533               for ( ii=0; ii<bs; ii++,value+=stepval ) {
534                 for (jj=ii; jj<bs2; jj+=bs ) {
535                   bap[jj] += *value++;
536                 }
537               }
538             } else {
539               for ( ii=0; ii<bs; ii++,value+=stepval ) {
540                 for (jj=ii; jj<bs2; jj+=bs ) {
541                   bap[jj] = *value++;
542                 }
543               }
544             }
545           } else {
546             if (is == ADD_VALUES) {
547               for ( ii=0; ii<bs; ii++,value+=stepval ) {
548                 for (jj=0; jj<bs; jj++ ) {
549                   *bap++ += *value++;
550                 }
551               }
552             } else {
553               for ( ii=0; ii<bs; ii++,value+=stepval ) {
554                 for (jj=0; jj<bs; jj++ ) {
555                   *bap++  = *value++;
556                 }
557               }
558             }
559           }
560           goto noinsert2;
561         }
562       }
563       if (nonew == 1) goto noinsert2;
564       else if (nonew == -1) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
565       if (nrow >= rmax) {
566         /* there is no extra room in row, therefore enlarge */
567         int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j;
568         Scalar *new_a;
569 
570         if (nonew == -2) SETERRQ(1,0,"Inserting a new nonzero in the matrix");
571 
572         /* malloc new storage space */
573         len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int);
574         new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a);
575         new_j   = (int *) (new_a + bs2*new_nz);
576         new_i   = new_j + new_nz;
577 
578         /* copy over old data into new slots */
579         for ( ii=0; ii<row+1; ii++ ) {new_i[ii] = ai[ii];}
580         for ( ii=row+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;}
581         PetscMemcpy(new_j,aj,(ai[row]+nrow)*sizeof(int));
582         len = (new_nz - CHUNKSIZE - ai[row] - nrow);
583         PetscMemcpy(new_j+ai[row]+nrow+CHUNKSIZE,aj+ai[row]+nrow,len*sizeof(int));
584         PetscMemcpy(new_a,aa,(ai[row]+nrow)*bs2*sizeof(Scalar));
585         PetscMemzero(new_a+bs2*(ai[row]+nrow),bs2*CHUNKSIZE*sizeof(Scalar));
586         PetscMemcpy(new_a+bs2*(ai[row]+nrow+CHUNKSIZE),aa+bs2*(ai[row]+nrow),bs2*len*sizeof(Scalar));
587         /* free up old matrix storage */
588         PetscFree(a->a);
589         if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);}
590         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;
591         a->singlemalloc = 1;
592 
593         rp   = aj + ai[row]; ap = aa + bs2*ai[row];
594         rmax = imax[row] = imax[row] + CHUNKSIZE;
595         PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar)));
596         a->maxnz += bs2*CHUNKSIZE;
597         a->reallocs++;
598         a->nz++;
599       }
600       N = nrow++ - 1;
601       /* shift up all the later entries in this row */
602       for ( ii=N; ii>=i; ii-- ) {
603         rp[ii+1] = rp[ii];
604         PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar));
605       }
606       if (N >= i) PetscMemzero(ap+bs2*i,bs2*sizeof(Scalar));
607       rp[i] = col;
608       bap   = ap +  bs2*i;
609       if (roworiented) {
610         for ( ii=0; ii<bs; ii++,value+=stepval) {
611           for (jj=ii; jj<bs2; jj+=bs ) {
612             bap[jj] = *value++;
613           }
614         }
615       } else {
616         for ( ii=0; ii<bs; ii++,value+=stepval ) {
617           for (jj=0; jj<bs; jj++ ) {
618             *bap++  = *value++;
619           }
620         }
621       }
622       noinsert2:;
623       low = i;
624     }
625     ailen[row] = nrow;
626   }
627   return 0;
628 }
629 
630 #undef __FUNC__
631 #define __FUNC__ "MatGetSize_SeqBAIJ"
632 int MatGetSize_SeqBAIJ(Mat A,int *m,int *n)
633 {
634   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
635   if (m) *m = a->m;
636   if (n) *n = a->n;
637   return 0;
638 }
639 
640 #undef __FUNC__
641 #define __FUNC__ "MatGetOwnershipRange_SeqBAIJ"
642 int MatGetOwnershipRange_SeqBAIJ(Mat A,int *m,int *n)
643 {
644   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
645   *m = 0; *n = a->m;
646   return 0;
647 }
648 
649 #undef __FUNC__
650 #define __FUNC__ "MatGetRow_SeqBAIJ"
651 int MatGetRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
652 {
653   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
654   int          itmp,i,j,k,M,*ai,*aj,bs,bn,bp,*idx_i,bs2;
655   Scalar      *aa,*v_i,*aa_i;
656 
657   bs  = a->bs;
658   ai  = a->i;
659   aj  = a->j;
660   aa  = a->a;
661   bs2 = a->bs2;
662 
663   if (row < 0 || row >= a->m) SETERRQ(1,0,"Row out of range");
664 
665   bn  = row/bs;   /* Block number */
666   bp  = row % bs; /* Block Position */
667   M   = ai[bn+1] - ai[bn];
668   *nz = bs*M;
669 
670   if (v) {
671     *v = 0;
672     if (*nz) {
673       *v = (Scalar *) PetscMalloc( (*nz)*sizeof(Scalar) ); CHKPTRQ(*v);
674       for ( i=0; i<M; i++ ) { /* for each block in the block row */
675         v_i  = *v + i*bs;
676         aa_i = aa + bs2*(ai[bn] + i);
677         for ( j=bp,k=0; j<bs2; j+=bs,k++ ) {v_i[k] = aa_i[j];}
678       }
679     }
680   }
681 
682   if (idx) {
683     *idx = 0;
684     if (*nz) {
685       *idx = (int *) PetscMalloc( (*nz)*sizeof(int) ); CHKPTRQ(*idx);
686       for ( i=0; i<M; i++ ) { /* for each block in the block row */
687         idx_i = *idx + i*bs;
688         itmp  = bs*aj[ai[bn] + i];
689         for ( j=0; j<bs; j++ ) {idx_i[j] = itmp++;}
690       }
691     }
692   }
693   return 0;
694 }
695 
696 #undef __FUNC__
697 #define __FUNC__ "MatRestoreRow_SeqBAIJ"
698 int MatRestoreRow_SeqBAIJ(Mat A,int row,int *nz,int **idx,Scalar **v)
699 {
700   if (idx) {if (*idx) PetscFree(*idx);}
701   if (v)   {if (*v)   PetscFree(*v);}
702   return 0;
703 }
704 
705 #undef __FUNC__
706 #define __FUNC__ "MatTranspose_SeqBAIJ"
707 int MatTranspose_SeqBAIJ(Mat A,Mat *B)
708 {
709   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
710   Mat         C;
711   int         i,j,k,ierr,*aj=a->j,*ai=a->i,bs=a->bs,mbs=a->mbs,nbs=a->nbs,len,*col;
712   int         *rows,*cols,bs2=a->bs2;
713   Scalar      *array=a->a;
714 
715   if (B==PETSC_NULL && mbs!=nbs)
716     SETERRQ(1,0,"Square matrix only for in-place");
717   col = (int *) PetscMalloc((1+nbs)*sizeof(int)); CHKPTRQ(col);
718   PetscMemzero(col,(1+nbs)*sizeof(int));
719 
720   for ( i=0; i<ai[mbs]; i++ ) col[aj[i]] += 1;
721   ierr = MatCreateSeqBAIJ(A->comm,bs,a->n,a->m,PETSC_NULL,col,&C); CHKERRQ(ierr);
722   PetscFree(col);
723   rows = (int *) PetscMalloc(2*bs*sizeof(int)); CHKPTRQ(rows);
724   cols = rows + bs;
725   for ( i=0; i<mbs; i++ ) {
726     cols[0] = i*bs;
727     for (k=1; k<bs; k++ ) cols[k] = cols[k-1] + 1;
728     len = ai[i+1] - ai[i];
729     for ( j=0; j<len; j++ ) {
730       rows[0] = (*aj++)*bs;
731       for (k=1; k<bs; k++ ) rows[k] = rows[k-1] + 1;
732       ierr = MatSetValues(C,bs,rows,bs,cols,array,INSERT_VALUES); CHKERRQ(ierr);
733       array += bs2;
734     }
735   }
736   PetscFree(rows);
737 
738   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
739   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
740 
741   if (B != PETSC_NULL) {
742     *B = C;
743   } else {
744     /* This isn't really an in-place transpose */
745     PetscFree(a->a);
746     if (!a->singlemalloc) {PetscFree(a->i); PetscFree(a->j);}
747     if (a->diag) PetscFree(a->diag);
748     if (a->ilen) PetscFree(a->ilen);
749     if (a->imax) PetscFree(a->imax);
750     if (a->solve_work) PetscFree(a->solve_work);
751     PetscFree(a);
752     PetscMemcpy(A,C,sizeof(struct _p_Mat));
753     PetscHeaderDestroy(C);
754   }
755   return 0;
756 }
757 
758 
759 #undef __FUNC__
760 #define __FUNC__ "MatAssemblyEnd_SeqBAIJ"
761 int MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
762 {
763   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
764   int        fshift = 0,i,j,*ai = a->i, *aj = a->j, *imax = a->imax;
765   int        m = a->m,*ip, N, *ailen = a->ilen;
766   int        mbs = a->mbs, bs2 = a->bs2,rmax = 0;
767   Scalar     *aa = a->a, *ap;
768 
769   if (mode == MAT_FLUSH_ASSEMBLY) return 0;
770 
771   if (m) rmax = ailen[0];
772   for ( i=1; i<mbs; i++ ) {
773     /* move each row back by the amount of empty slots (fshift) before it*/
774     fshift += imax[i-1] - ailen[i-1];
775     rmax   = PetscMax(rmax,ailen[i]);
776     if (fshift) {
777       ip = aj + ai[i]; ap = aa + bs2*ai[i];
778       N = ailen[i];
779       for ( j=0; j<N; j++ ) {
780         ip[j-fshift] = ip[j];
781         PetscMemcpy(ap+(j-fshift)*bs2,ap+j*bs2,bs2*sizeof(Scalar));
782       }
783     }
784     ai[i] = ai[i-1] + ailen[i-1];
785   }
786   if (mbs) {
787     fshift += imax[mbs-1] - ailen[mbs-1];
788     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
789   }
790   /* reset ilen and imax for each row */
791   for ( i=0; i<mbs; i++ ) {
792     ailen[i] = imax[i] = ai[i+1] - ai[i];
793   }
794   a->nz = ai[mbs];
795 
796   /* diagonals may have moved, so kill the diagonal pointers */
797   if (fshift && a->diag) {
798     PetscFree(a->diag);
799     PLogObjectMemory(A,-(m+1)*sizeof(int));
800     a->diag = 0;
801   }
802   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Matrix size: %d X %d, block size %d; storage space: %d unneeded, %d used\n",
803            m,a->n,a->bs,fshift*bs2,a->nz*bs2);
804   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Number of mallocs during MatSetValues is %d\n",
805            a->reallocs);
806   PLogInfo(A,"MatAssemblyEnd_SeqBAIJ:Most nonzeros blocks in any row is %d\n",rmax);
807   a->reallocs          = 0;
808   A->info.nz_unneeded  = (double)fshift*bs2;
809 
810   return 0;
811 }
812 
813 #undef __FUNC__
814 #define __FUNC__ "MatZeroEntries_SeqBAIJ"
815 int MatZeroEntries_SeqBAIJ(Mat A)
816 {
817   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
818   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(Scalar));
819   return 0;
820 }
821 
822 #undef __FUNC__
823 #define __FUNC__ "MatDestroy_SeqBAIJ"
824 int MatDestroy_SeqBAIJ(PetscObject obj)
825 {
826   Mat         A  = (Mat) obj;
827   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
828 
829 #if defined(PETSC_LOG)
830   PLogObjectState(obj,"Rows=%d, Cols=%d, NZ=%d",a->m,a->n,a->nz);
831 #endif
832   PetscFree(a->a);
833   if (!a->singlemalloc) { PetscFree(a->i); PetscFree(a->j);}
834   if (a->diag) PetscFree(a->diag);
835   if (a->ilen) PetscFree(a->ilen);
836   if (a->imax) PetscFree(a->imax);
837   if (a->solve_work) PetscFree(a->solve_work);
838   if (a->mult_work) PetscFree(a->mult_work);
839   PetscFree(a);
840   PLogObjectDestroy(A);
841   PetscHeaderDestroy(A);
842   return 0;
843 }
844 
845 #undef __FUNC__
846 #define __FUNC__ "MatSetOption_SeqBAIJ"
847 int MatSetOption_SeqBAIJ(Mat A,MatOption op)
848 {
849   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
850   if      (op == MAT_ROW_ORIENTED)                 a->roworiented = 1;
851   else if (op == MAT_COLUMN_ORIENTED)              a->roworiented = 0;
852   else if (op == MAT_COLUMNS_SORTED)               a->sorted      = 1;
853   else if (op == MAT_COLUMNS_UNSORTED)             a->sorted      = 0;
854   else if (op == MAT_NO_NEW_NONZERO_LOCATIONS)     a->nonew       = 1;
855   else if (op == MAT_NEW_NONZERO_LOCATION_ERROR)   a->nonew       = -1;
856   else if (op == MAT_NEW_NONZERO_ALLOCATION_ERROR) a->nonew       = -2;
857   else if (op == MAT_YES_NEW_NONZERO_LOCATIONS)    a->nonew       = 0;
858   else if (op == MAT_ROWS_SORTED ||
859            op == MAT_ROWS_UNSORTED ||
860            op == MAT_SYMMETRIC ||
861            op == MAT_STRUCTURALLY_SYMMETRIC ||
862            op == MAT_YES_NEW_DIAGONALS ||
863            op == MAT_IGNORE_OFF_PROC_ENTRIES)
864     PLogInfo(A,"Info:MatSetOption_SeqBAIJ:Option ignored\n");
865   else if (op == MAT_NO_NEW_DIAGONALS)
866     {SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");}
867   else
868     {SETERRQ(PETSC_ERR_SUP,0,"unknown option");}
869   return 0;
870 }
871 
872 
873 /* -------------------------------------------------------*/
874 /* Should check that shapes of vectors and matrices match */
875 /* -------------------------------------------------------*/
876 #include "pinclude/plapack.h"
877 
878 #undef __FUNC__
879 #define __FUNC__ "MatMult_SeqBAIJ_1"
880 static int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
881 {
882   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
883   register Scalar *x,*z,*v,sum;
884   int             mbs=a->mbs,i,*idx,*ii,n;
885 
886   VecGetArray_Fast(xx,x);
887   VecGetArray_Fast(zz,z);
888 
889   idx   = a->j;
890   v     = a->a;
891   ii    = a->i;
892 
893   for ( i=0; i<mbs; i++ ) {
894     n    = ii[1] - ii[0]; ii++;
895     sum  = 0.0;
896     while (n--) sum += *v++ * x[*idx++];
897     z[i] = sum;
898   }
899   VecRestoreArray_Fast(xx,x);
900   VecRestoreArray_Fast(zz,z);
901   PLogFlops(2*a->nz - a->m);
902   return 0;
903 }
904 
905 #undef __FUNC__
906 #define __FUNC__ "MatMult_SeqBAIJ_2"
907 static int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
908 {
909   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
910   register Scalar *x,*z,*v,*xb,sum1,sum2;
911   register Scalar x1,x2;
912   int             mbs=a->mbs,i,*idx,*ii;
913   int             j,n;
914 
915   VecGetArray_Fast(xx,x);
916   VecGetArray_Fast(zz,z);
917 
918   idx   = a->j;
919   v     = a->a;
920   ii    = a->i;
921 
922   for ( i=0; i<mbs; i++ ) {
923     n  = ii[1] - ii[0]; ii++;
924     sum1 = 0.0; sum2 = 0.0;
925     for ( j=0; j<n; j++ ) {
926       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
927       sum1 += v[0]*x1 + v[2]*x2;
928       sum2 += v[1]*x1 + v[3]*x2;
929       v += 4;
930     }
931     z[0] = sum1; z[1] = sum2;
932     z += 2;
933   }
934   VecRestoreArray_Fast(xx,x);
935   VecRestoreArray_Fast(zz,z);
936   PLogFlops(4*a->nz - a->m);
937   return 0;
938 }
939 
940 #undef __FUNC__
941 #define __FUNC__ "MatMult_SeqBAIJ_3"
942 static int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
943 {
944   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
945   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
946   int             mbs=a->mbs,i,*idx,*ii,j,n;
947 
948   VecGetArray_Fast(xx,x);
949   VecGetArray_Fast(zz,z);
950 
951   idx   = a->j;
952   v     = a->a;
953   ii    = a->i;
954 
955   for ( i=0; i<mbs; i++ ) {
956     n  = ii[1] - ii[0]; ii++;
957     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
958     for ( j=0; j<n; j++ ) {
959       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
960       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
961       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
962       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
963       v += 9;
964     }
965     z[0] = sum1; z[1] = sum2; z[2] = sum3;
966     z += 3;
967   }
968   VecRestoreArray_Fast(xx,x);
969   VecRestoreArray_Fast(zz,z);
970   PLogFlops(18*a->nz - a->m);
971   return 0;
972 }
973 
974 #undef __FUNC__
975 #define __FUNC__ "MatMult_SeqBAIJ_4"
976 static int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
977 {
978   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
979   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4;
980   register Scalar x1,x2,x3,x4;
981   int             mbs=a->mbs,i,*idx,*ii;
982   int             j,n;
983 
984   VecGetArray_Fast(xx,x);
985   VecGetArray_Fast(zz,z);
986 
987   idx   = a->j;
988   v     = a->a;
989   ii    = a->i;
990 
991   for ( i=0; i<mbs; i++ ) {
992     n  = ii[1] - ii[0]; ii++;
993     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
994     for ( j=0; j<n; j++ ) {
995       xb = x + 4*(*idx++);
996       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
997       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
998       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
999       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1000       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1001       v += 16;
1002     }
1003     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1004     z += 4;
1005   }
1006   VecRestoreArray_Fast(xx,x);
1007   VecRestoreArray_Fast(zz,z);
1008   PLogFlops(32*a->nz - a->m);
1009   return 0;
1010 }
1011 
1012 #undef __FUNC__
1013 #define __FUNC__ "MatMult_SeqBAIJ_5"
1014 static int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
1015 {
1016   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1017   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1018   register Scalar x1,x2,x3,x4,x5;
1019   int             mbs=a->mbs,i,*idx,*ii,j,n;
1020 
1021   VecGetArray_Fast(xx,x);
1022   VecGetArray_Fast(zz,z);
1023 
1024   idx   = a->j;
1025   v     = a->a;
1026   ii    = a->i;
1027 
1028   for ( i=0; i<mbs; i++ ) {
1029     n  = ii[1] - ii[0]; ii++;
1030     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
1031     for ( j=0; j<n; j++ ) {
1032       xb = x + 5*(*idx++);
1033       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1034       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1035       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1036       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1037       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1038       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1039       v += 25;
1040     }
1041     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1042     z += 5;
1043   }
1044   VecRestoreArray_Fast(xx,x);
1045   VecRestoreArray_Fast(zz,z);
1046   PLogFlops(50*a->nz - a->m);
1047   return 0;
1048 }
1049 
1050 #undef __FUNC__
1051 #define __FUNC__ "MatMult_SeqBAIJ_7"
1052 static int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1053 {
1054   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1055   register Scalar *x,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1056   register Scalar x1,x2,x3,x4,x5,x6,x7;
1057   int             mbs=a->mbs,i,*idx,*ii,j,n;
1058 
1059   VecGetArray_Fast(xx,x);
1060   VecGetArray_Fast(zz,z);
1061 
1062   idx   = a->j;
1063   v     = a->a;
1064   ii    = a->i;
1065 
1066   for ( i=0; i<mbs; i++ ) {
1067     n  = ii[1] - ii[0]; ii++;
1068     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1069     for ( j=0; j<n; j++ ) {
1070       xb = x + 7*(*idx++);
1071       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1072       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1073       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1074       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1075       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1076       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1077       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1078       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1079       v += 49;
1080     }
1081     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1082     z += 7;
1083   }
1084 
1085   VecRestoreArray_Fast(xx,x);
1086   VecRestoreArray_Fast(zz,z);
1087   PLogFlops(98*a->nz - a->m);
1088   return 0;
1089 }
1090 
1091 #undef __FUNC__
1092 #define __FUNC__ "MatMult_SeqBAIJ_N"
1093 static int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1094 {
1095   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1096   register Scalar *x,*z,*v,*xb;
1097   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
1098   int             _One = 1,ncols,k;
1099   Scalar          _DOne = 1.0,*work,*workt,_DZero = 0.0;
1100 
1101   VecGetArray_Fast(xx,x);
1102   VecGetArray_Fast(zz,z);
1103 
1104   idx   = a->j;
1105   v     = a->a;
1106   ii    = a->i;
1107 
1108 
1109   if (!a->mult_work) {
1110     k = PetscMax(a->m,a->n);
1111     a->mult_work = (Scalar *) PetscMalloc((k+1)*sizeof(Scalar));CHKPTRQ(a->mult_work);
1112   }
1113   work = a->mult_work;
1114   for ( i=0; i<mbs; i++ ) {
1115     n     = ii[1] - ii[0]; ii++;
1116     ncols = n*bs;
1117     workt = work;
1118     for ( j=0; j<n; j++ ) {
1119       xb = x + bs*(*idx++);
1120       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1121       workt += bs;
1122     }
1123     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One);
1124     v += n*bs2;
1125     z += bs;
1126   }
1127   VecRestoreArray_Fast(xx,x);
1128   VecRestoreArray_Fast(zz,z);
1129   PLogFlops(2*a->nz*bs2 - a->m);
1130   return 0;
1131 }
1132 
1133 #undef __FUNC__
1134 #define __FUNC__ "MatMultAdd_SeqBAIJ_1"
1135 static int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1136 {
1137   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1138   register Scalar *x,*y,*z,*v,sum;
1139   int             mbs=a->mbs,i,*idx,*ii,n;
1140 
1141   VecGetArray_Fast(xx,x);
1142   VecGetArray_Fast(yy,y);
1143   VecGetArray_Fast(zz,z);
1144 
1145   idx   = a->j;
1146   v     = a->a;
1147   ii    = a->i;
1148 
1149   for ( i=0; i<mbs; i++ ) {
1150     n    = ii[1] - ii[0]; ii++;
1151     sum  = y[i];
1152     while (n--) sum += *v++ * x[*idx++];
1153     z[i] = sum;
1154   }
1155   VecRestoreArray_Fast(xx,x);
1156   VecRestoreArray_Fast(yy,y);
1157   VecRestoreArray_Fast(zz,z);
1158   PLogFlops(2*a->nz);
1159   return 0;
1160 }
1161 
1162 #undef __FUNC__
1163 #define __FUNC__ "MatMultAdd_SeqBAIJ_2"
1164 static int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1165 {
1166   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1167   register Scalar *x,*y,*z,*v,*xb,sum1,sum2;
1168   register Scalar x1,x2;
1169   int             mbs=a->mbs,i,*idx,*ii;
1170   int             j,n;
1171 
1172   VecGetArray_Fast(xx,x);
1173   VecGetArray_Fast(yy,y);
1174   VecGetArray_Fast(zz,z);
1175 
1176   idx   = a->j;
1177   v     = a->a;
1178   ii    = a->i;
1179 
1180   for ( i=0; i<mbs; i++ ) {
1181     n  = ii[1] - ii[0]; ii++;
1182     sum1 = y[0]; sum2 = y[1];
1183     for ( j=0; j<n; j++ ) {
1184       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1185       sum1 += v[0]*x1 + v[2]*x2;
1186       sum2 += v[1]*x1 + v[3]*x2;
1187       v += 4;
1188     }
1189     z[0] = sum1; z[1] = sum2;
1190     z += 2; y += 2;
1191   }
1192   VecRestoreArray_Fast(xx,x);
1193   VecRestoreArray_Fast(yy,y);
1194   VecRestoreArray_Fast(zz,z);
1195   PLogFlops(4*a->nz);
1196   return 0;
1197 }
1198 
1199 #undef __FUNC__
1200 #define __FUNC__ "MatMultAdd_SeqBAIJ_3"
1201 static int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1202 {
1203   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1204   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,x1,x2,x3;
1205   int             mbs=a->mbs,i,*idx,*ii,j,n;
1206 
1207   VecGetArray_Fast(xx,x);
1208   VecGetArray_Fast(yy,y);
1209   VecGetArray_Fast(zz,z);
1210 
1211   idx   = a->j;
1212   v     = a->a;
1213   ii    = a->i;
1214 
1215   for ( i=0; i<mbs; i++ ) {
1216     n  = ii[1] - ii[0]; ii++;
1217     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1218     for ( j=0; j<n; j++ ) {
1219       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1220       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1221       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1222       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1223       v += 9;
1224     }
1225     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1226     z += 3; y += 3;
1227   }
1228   VecRestoreArray_Fast(xx,x);
1229   VecRestoreArray_Fast(yy,y);
1230   VecRestoreArray_Fast(zz,z);
1231   PLogFlops(18*a->nz);
1232   return 0;
1233 }
1234 
1235 #undef __FUNC__
1236 #define __FUNC__ "MatMultAdd_SeqBAIJ_4"
1237 static int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1238 {
1239   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1240   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4;
1241   register Scalar x1,x2,x3,x4;
1242   int             mbs=a->mbs,i,*idx,*ii;
1243   int             j,n;
1244 
1245   VecGetArray_Fast(xx,x);
1246   VecGetArray_Fast(yy,y);
1247   VecGetArray_Fast(zz,z);
1248 
1249   idx   = a->j;
1250   v     = a->a;
1251   ii    = a->i;
1252 
1253   for ( i=0; i<mbs; i++ ) {
1254     n  = ii[1] - ii[0]; ii++;
1255     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1256     for ( j=0; j<n; j++ ) {
1257       xb = x + 4*(*idx++);
1258       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1259       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1260       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1261       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1262       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1263       v += 16;
1264     }
1265     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1266     z += 4; y += 4;
1267   }
1268   VecRestoreArray_Fast(xx,x);
1269   VecRestoreArray_Fast(yy,y);
1270   VecRestoreArray_Fast(zz,z);
1271   PLogFlops(32*a->nz);
1272   return 0;
1273 }
1274 
1275 #undef __FUNC__
1276 #define __FUNC__ "MatMultAdd_SeqBAIJ_5"
1277 static int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1278 {
1279   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1280   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5;
1281   register Scalar x1,x2,x3,x4,x5;
1282   int             mbs=a->mbs,i,*idx,*ii,j,n;
1283 
1284   VecGetArray_Fast(xx,x);
1285   VecGetArray_Fast(yy,y);
1286   VecGetArray_Fast(zz,z);
1287 
1288   idx   = a->j;
1289   v     = a->a;
1290   ii    = a->i;
1291 
1292   for ( i=0; i<mbs; i++ ) {
1293     n  = ii[1] - ii[0]; ii++;
1294     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1295     for ( j=0; j<n; j++ ) {
1296       xb = x + 5*(*idx++);
1297       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1298       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1299       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1300       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1301       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1302       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1303       v += 25;
1304     }
1305     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1306     z += 5; y += 5;
1307   }
1308   VecRestoreArray_Fast(xx,x);
1309   VecRestoreArray_Fast(yy,y);
1310   VecRestoreArray_Fast(zz,z);
1311   PLogFlops(50*a->nz);
1312   return 0;
1313 }
1314 
1315 #undef __FUNC__
1316 #define __FUNC__ "MatMultAdd_SeqBAIJ_7"
1317 static int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1318 {
1319   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1320   register Scalar *x,*y,*z,*v,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1321   register Scalar x1,x2,x3,x4,x5,x6,x7;
1322   int             mbs=a->mbs,i,*idx,*ii,j,n;
1323 
1324   VecGetArray_Fast(xx,x);
1325   VecGetArray_Fast(yy,y);
1326   VecGetArray_Fast(zz,z);
1327 
1328   idx   = a->j;
1329   v     = a->a;
1330   ii    = a->i;
1331 
1332   for ( i=0; i<mbs; i++ ) {
1333     n  = ii[1] - ii[0]; ii++;
1334     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1335     for ( j=0; j<n; j++ ) {
1336       xb = x + 7*(*idx++);
1337       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1338       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1339       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1340       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1341       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1342       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1343       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1344       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1345       v += 49;
1346     }
1347     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1348     z += 7; y += 7;
1349   }
1350   VecRestoreArray_Fast(xx,x);
1351   VecRestoreArray_Fast(yy,y);
1352   VecRestoreArray_Fast(zz,z);
1353   PLogFlops(98*a->nz);
1354   return 0;
1355 }
1356 
1357 
1358 #undef __FUNC__
1359 #define __FUNC__ "MatMultAdd_SeqBAIJ_N"
1360 static int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1361 {
1362   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1363   register Scalar *x,*z,*v,*xb;
1364   int             mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
1365   int             _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1366 
1367   if ( xx != yy) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1368 
1369   VecGetArray_Fast(xx,x);
1370   VecGetArray_Fast(zz,z);
1371 
1372   idx   = a->j;
1373   v     = a->a;
1374   ii    = a->i;
1375 
1376 
1377   if (!a->mult_work) {
1378     k = PetscMax(a->m,a->n);
1379     a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));CHKPTRQ(a->mult_work);
1380   }
1381   work = a->mult_work;
1382   for ( i=0; i<mbs; i++ ) {
1383     n     = ii[1] - ii[0]; ii++;
1384     ncols = n*bs;
1385     workt = work;
1386     for ( j=0; j<n; j++ ) {
1387       xb = x + bs*(*idx++);
1388       for ( k=0; k<bs; k++ ) workt[k] = xb[k];
1389       workt += bs;
1390     }
1391     LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One);
1392     v += n*bs2;
1393     z += bs;
1394   }
1395   VecRestoreArray_Fast(xx,x);
1396   VecRestoreArray_Fast(zz,z);
1397   PLogFlops(2*a->nz*bs2 );
1398   return 0;
1399 }
1400 
1401 #undef __FUNC__
1402 #define __FUNC__ "MatMultTrans_SeqBAIJ"
1403 int MatMultTrans_SeqBAIJ(Mat A,Vec xx,Vec zz)
1404 {
1405   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1406   Scalar          *xg,*zg,*zb;
1407   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1408   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1409   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1410 
1411 
1412   VecGetArray_Fast(xx,xg); x = xg;
1413   VecGetArray_Fast(zz,zg); z = zg;
1414   PetscMemzero(z,N*sizeof(Scalar));
1415 
1416   idx   = a->j;
1417   v     = a->a;
1418   ii    = a->i;
1419 
1420   switch (bs) {
1421   case 1:
1422     for ( i=0; i<mbs; i++ ) {
1423       n  = ii[1] - ii[0]; ii++;
1424       xb = x + i; x1 = xb[0];
1425       ib = idx + ai[i];
1426       for ( j=0; j<n; j++ ) {
1427         rval    = ib[j];
1428         z[rval] += *v++ * x1;
1429       }
1430     }
1431     break;
1432   case 2:
1433     for ( i=0; i<mbs; i++ ) {
1434       n  = ii[1] - ii[0]; ii++;
1435       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1436       ib = idx + ai[i];
1437       for ( j=0; j<n; j++ ) {
1438         rval      = ib[j]*2;
1439         z[rval++] += v[0]*x1 + v[1]*x2;
1440         z[rval++] += v[2]*x1 + v[3]*x2;
1441         v += 4;
1442       }
1443     }
1444     break;
1445   case 3:
1446     for ( i=0; i<mbs; i++ ) {
1447       n  = ii[1] - ii[0]; ii++;
1448       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1449       ib = idx + ai[i];
1450       for ( j=0; j<n; j++ ) {
1451         rval      = ib[j]*3;
1452         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1453         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1454         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1455         v += 9;
1456       }
1457     }
1458     break;
1459   case 4:
1460     for ( i=0; i<mbs; i++ ) {
1461       n  = ii[1] - ii[0]; ii++;
1462       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1463       ib = idx + ai[i];
1464       for ( j=0; j<n; j++ ) {
1465         rval      = ib[j]*4;
1466         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1467         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1468         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1469         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1470         v += 16;
1471       }
1472     }
1473     break;
1474   case 5:
1475     for ( i=0; i<mbs; i++ ) {
1476       n  = ii[1] - ii[0]; ii++;
1477       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1478       x4 = xb[3];   x5 = xb[4];
1479       ib = idx + ai[i];
1480       for ( j=0; j<n; j++ ) {
1481         rval      = ib[j]*5;
1482         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1483         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1484         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1485         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1486         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1487         v += 25;
1488       }
1489     }
1490     break;
1491       /* block sizes larger then 5 by 5 are handled by BLAS */
1492     default: {
1493       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1494       if (!a->mult_work) {
1495         k = PetscMax(a->m,a->n);
1496         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1497         CHKPTRQ(a->mult_work);
1498       }
1499       work = a->mult_work;
1500       for ( i=0; i<mbs; i++ ) {
1501         n     = ii[1] - ii[0]; ii++;
1502         ncols = n*bs;
1503         PetscMemzero(work,ncols*sizeof(Scalar));
1504         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1505         v += n*bs2;
1506         x += bs;
1507         workt = work;
1508         for ( j=0; j<n; j++ ) {
1509           zb = z + bs*(*idx++);
1510           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1511           workt += bs;
1512         }
1513       }
1514     }
1515   }
1516   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1517   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1518   PLogFlops(2*a->nz*a->bs2 - a->n);
1519   return 0;
1520 }
1521 
1522 #undef __FUNC__
1523 #define __FUNC__ "MatMultTransAdd_SeqBAIJ"
1524 int MatMultTransAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1525 {
1526   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data;
1527   Scalar          *xg,*zg,*zb;
1528   register Scalar *x,*z,*v,*xb,x1,x2,x3,x4,x5;
1529   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,N=a->n;
1530   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1531 
1532 
1533 
1534   VecGetArray_Fast(xx,xg); x = xg;
1535   VecGetArray_Fast(zz,zg); z = zg;
1536 
1537   if ( yy != zz ) { ierr = VecCopy(yy,zz); CHKERRQ(ierr); }
1538   else PetscMemzero(z,N*sizeof(Scalar));
1539 
1540 
1541   idx   = a->j;
1542   v     = a->a;
1543   ii    = a->i;
1544 
1545   switch (bs) {
1546   case 1:
1547     for ( i=0; i<mbs; i++ ) {
1548       n  = ii[1] - ii[0]; ii++;
1549       xb = x + i; x1 = xb[0];
1550       ib = idx + ai[i];
1551       for ( j=0; j<n; j++ ) {
1552         rval    = ib[j];
1553         z[rval] += *v++ * x1;
1554       }
1555     }
1556     break;
1557   case 2:
1558     for ( i=0; i<mbs; i++ ) {
1559       n  = ii[1] - ii[0]; ii++;
1560       xb = x + 2*i; x1 = xb[0]; x2 = xb[1];
1561       ib = idx + ai[i];
1562       for ( j=0; j<n; j++ ) {
1563         rval      = ib[j]*2;
1564         z[rval++] += v[0]*x1 + v[1]*x2;
1565         z[rval++] += v[2]*x1 + v[3]*x2;
1566         v += 4;
1567       }
1568     }
1569     break;
1570   case 3:
1571     for ( i=0; i<mbs; i++ ) {
1572       n  = ii[1] - ii[0]; ii++;
1573       xb = x + 3*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1574       ib = idx + ai[i];
1575       for ( j=0; j<n; j++ ) {
1576         rval      = ib[j]*3;
1577         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1578         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1579         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1580         v += 9;
1581       }
1582     }
1583     break;
1584   case 4:
1585     for ( i=0; i<mbs; i++ ) {
1586       n  = ii[1] - ii[0]; ii++;
1587       xb = x + 4*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1588       ib = idx + ai[i];
1589       for ( j=0; j<n; j++ ) {
1590         rval      = ib[j]*4;
1591         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1592         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1593         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1594         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1595         v += 16;
1596       }
1597     }
1598     break;
1599   case 5:
1600     for ( i=0; i<mbs; i++ ) {
1601       n  = ii[1] - ii[0]; ii++;
1602       xb = x + 5*i; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1603       x4 = xb[3];   x5 = xb[4];
1604       ib = idx + ai[i];
1605       for ( j=0; j<n; j++ ) {
1606         rval      = ib[j]*5;
1607         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1608         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1609         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1610         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1611         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1612         v += 25;
1613       }
1614     }
1615     break;
1616       /* block sizes larger then 5 by 5 are handled by BLAS */
1617     default: {
1618       int  _One = 1,ncols,k; Scalar _DOne = 1.0, *work,*workt;
1619       if (!a->mult_work) {
1620         k = PetscMax(a->m,a->n);
1621         a->mult_work = (Scalar *) PetscMalloc(k*sizeof(Scalar));
1622         CHKPTRQ(a->mult_work);
1623       }
1624       work = a->mult_work;
1625       for ( i=0; i<mbs; i++ ) {
1626         n     = ii[1] - ii[0]; ii++;
1627         ncols = n*bs;
1628         PetscMemzero(work,ncols*sizeof(Scalar));
1629         LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One);
1630         v += n*bs2;
1631         x += bs;
1632         workt = work;
1633         for ( j=0; j<n; j++ ) {
1634           zb = z + bs*(*idx++);
1635           for ( k=0; k<bs; k++ ) zb[k] += workt[k] ;
1636           workt += bs;
1637         }
1638       }
1639     }
1640   }
1641   ierr = VecRestoreArray(xx,&xg); CHKERRQ(ierr);
1642   ierr = VecRestoreArray(zz,&zg); CHKERRQ(ierr);
1643   PLogFlops(2*a->nz*a->bs2);
1644   return 0;
1645 }
1646 
1647 #undef __FUNC__
1648 #define __FUNC__ "MatGetInfo_SeqBAIJ"
1649 int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1650 {
1651   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1652 
1653   info->rows_global    = (double)a->m;
1654   info->columns_global = (double)a->n;
1655   info->rows_local     = (double)a->m;
1656   info->columns_local  = (double)a->n;
1657   info->block_size     = a->bs2;
1658   info->nz_allocated   = a->maxnz;
1659   info->nz_used        = a->bs2*a->nz;
1660   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1661   /*  if (info->nz_unneeded != A->info.nz_unneeded)
1662     printf("space descrepancy: maxnz-nz = %d, nz_unneeded = %d\n",(int)info->nz_unneeded,(int)A->info.nz_unneeded); */
1663   info->assemblies   = A->num_ass;
1664   info->mallocs      = a->reallocs;
1665   info->memory       = A->mem;
1666   if (A->factor) {
1667     info->fill_ratio_given  = A->info.fill_ratio_given;
1668     info->fill_ratio_needed = A->info.fill_ratio_needed;
1669     info->factor_mallocs    = A->info.factor_mallocs;
1670   } else {
1671     info->fill_ratio_given  = 0;
1672     info->fill_ratio_needed = 0;
1673     info->factor_mallocs    = 0;
1674   }
1675   return 0;
1676 }
1677 
1678 #undef __FUNC__
1679 #define __FUNC__ "MatEqual_SeqBAIJ"
1680 int MatEqual_SeqBAIJ(Mat A,Mat B, PetscTruth* flg)
1681 {
1682   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
1683 
1684   if (B->type !=MATSEQBAIJ)SETERRQ(1,0,"Matrices must be same type");
1685 
1686   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1687   if ((a->m != b->m) || (a->n !=b->n) || (a->bs != b->bs)||
1688       (a->nz != b->nz)) {
1689     *flg = PETSC_FALSE; return 0;
1690   }
1691 
1692   /* if the a->i are the same */
1693   if (PetscMemcmp(a->i,b->i, (a->mbs+1)*sizeof(int))) {
1694     *flg = PETSC_FALSE; return 0;
1695   }
1696 
1697   /* if a->j are the same */
1698   if (PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int))) {
1699     *flg = PETSC_FALSE; return 0;
1700   }
1701 
1702   /* if a->a are the same */
1703   if (PetscMemcmp(a->a, b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar))) {
1704     *flg = PETSC_FALSE; return 0;
1705   }
1706   *flg = PETSC_TRUE;
1707   return 0;
1708 
1709 }
1710 
1711 #undef __FUNC__
1712 #define __FUNC__ "MatGetDiagonal_SeqBAIJ"
1713 int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1714 {
1715   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1716   int         i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1717   Scalar      *x, zero = 0.0,*aa,*aa_j;
1718 
1719   if (A->factor) SETERRQ(1,0,"Not for factored matrix");
1720   bs   = a->bs;
1721   aa   = a->a;
1722   ai   = a->i;
1723   aj   = a->j;
1724   ambs = a->mbs;
1725   bs2  = a->bs2;
1726 
1727   VecSet(&zero,v);
1728   VecGetArray_Fast(v,x); VecGetLocalSize_Fast(v,n);
1729   if (n != a->m) SETERRQ(1,0,"Nonconforming matrix and vector");
1730   for ( i=0; i<ambs; i++ ) {
1731     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1732       if (aj[j] == i) {
1733         row  = i*bs;
1734         aa_j = aa+j*bs2;
1735         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1736         break;
1737       }
1738     }
1739   }
1740   return 0;
1741 }
1742 
1743 #undef __FUNC__
1744 #define __FUNC__ "MatDiagonalScale_SeqBAIJ"
1745 int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1746 {
1747   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1748   Scalar      *l,*r,x,*v,*aa,*li,*ri;
1749   int         i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1750 
1751   ai  = a->i;
1752   aj  = a->j;
1753   aa  = a->a;
1754   m   = a->m;
1755   n   = a->n;
1756   bs  = a->bs;
1757   mbs = a->mbs;
1758   bs2 = a->bs2;
1759   if (ll) {
1760     VecGetArray_Fast(ll,l); VecGetLocalSize_Fast(ll,lm);
1761     if (lm != m) SETERRQ(1,0,"Left scaling vector wrong length");
1762     for ( i=0; i<mbs; i++ ) { /* for each block row */
1763       M  = ai[i+1] - ai[i];
1764       li = l + i*bs;
1765       v  = aa + bs2*ai[i];
1766       for ( j=0; j<M; j++ ) { /* for each block */
1767         for ( k=0; k<bs2; k++ ) {
1768           (*v++) *= li[k%bs];
1769         }
1770       }
1771     }
1772   }
1773 
1774   if (rr) {
1775     VecGetArray_Fast(rr,r); VecGetLocalSize_Fast(rr,rn);
1776     if (rn != n) SETERRQ(1,0,"Right scaling vector wrong length");
1777     for ( i=0; i<mbs; i++ ) { /* for each block row */
1778       M  = ai[i+1] - ai[i];
1779       v  = aa + bs2*ai[i];
1780       for ( j=0; j<M; j++ ) { /* for each block */
1781         ri = r + bs*aj[ai[i]+j];
1782         for ( k=0; k<bs; k++ ) {
1783           x = ri[k];
1784           for ( tmp=0; tmp<bs; tmp++ ) (*v++) *= x;
1785         }
1786       }
1787     }
1788   }
1789   return 0;
1790 }
1791 
1792 
1793 extern int MatLUFactorSymbolic_SeqBAIJ(Mat,IS,IS,double,Mat*);
1794 extern int MatLUFactor_SeqBAIJ(Mat,IS,IS,double);
1795 extern int MatIncreaseOverlap_SeqBAIJ(Mat,int,IS*,int);
1796 extern int MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatGetSubMatrixCall,Mat*);
1797 extern int MatGetSubMatrices_SeqBAIJ(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat**);
1798 
1799 extern int MatSolve_SeqBAIJ_N(Mat,Vec,Vec);
1800 extern int MatSolve_SeqBAIJ_1(Mat,Vec,Vec);
1801 extern int MatSolve_SeqBAIJ_2(Mat,Vec,Vec);
1802 extern int MatSolve_SeqBAIJ_3(Mat,Vec,Vec);
1803 extern int MatSolve_SeqBAIJ_4(Mat,Vec,Vec);
1804 extern int MatSolve_SeqBAIJ_5(Mat,Vec,Vec);
1805 extern int MatSolve_SeqBAIJ_7(Mat,Vec,Vec);
1806 
1807 extern int MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat*);
1808 extern int MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat*);
1809 extern int MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat*);
1810 extern int MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat*);
1811 extern int MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat*);
1812 extern int MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat*);
1813 
1814 #undef __FUNC__
1815 #define __FUNC__ "MatNorm_SeqBAIJ"
1816 int MatNorm_SeqBAIJ(Mat A,NormType type,double *norm)
1817 {
1818   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1819   Scalar      *v = a->a;
1820   double      sum = 0.0;
1821   int         i,nz=a->nz,bs2=a->bs2;
1822 
1823   if (type == NORM_FROBENIUS) {
1824     for (i=0; i< bs2*nz; i++ ) {
1825 #if defined(PETSC_COMPLEX)
1826       sum += real(conj(*v)*(*v)); v++;
1827 #else
1828       sum += (*v)*(*v); v++;
1829 #endif
1830     }
1831     *norm = sqrt(sum);
1832   }
1833   else {
1834     SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
1835   }
1836   return 0;
1837 }
1838 
1839 extern int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec);
1840 /*
1841      note: This can only work for identity for row and col. It would
1842    be good to check this and otherwise generate an error.
1843 */
1844 #undef __FUNC__
1845 #define __FUNC__ "MatILUFactor_SeqBAIJ"
1846 int MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,double efill,int fill)
1847 {
1848   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1849   Mat         outA;
1850   int         ierr;
1851 
1852   if (fill != 0) SETERRQ(1,0,"Only fill=0 supported");
1853 
1854   outA          = inA;
1855   inA->factor   = FACTOR_LU;
1856   a->row        = row;
1857   a->col        = col;
1858 
1859   if (!a->solve_work) {
1860     a->solve_work = (Scalar *) PetscMalloc((a->m+a->bs)*sizeof(Scalar));CHKPTRQ(a->solve_work);
1861     PLogObjectMemory(inA,(a->m+a->bs)*sizeof(Scalar));
1862   }
1863 
1864   if (!a->diag) {
1865     ierr = MatMarkDiag_SeqBAIJ(inA); CHKERRQ(ierr);
1866   }
1867   ierr = MatLUFactorNumeric(inA,&outA); CHKERRQ(ierr);
1868 
1869   /*
1870       Blocksize 4 has a special faster solver for ILU(0) factorization
1871     with natural ordering
1872   */
1873   if (a->bs == 4) {
1874     inA->ops.solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
1875   }
1876 
1877   return 0;
1878 }
1879 
1880 #undef __FUNC__
1881 #define __FUNC__ "MatScale_SeqBAIJ"
1882 int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1883 {
1884   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) inA->data;
1885   int         one = 1, totalnz = a->bs2*a->nz;
1886   BLscal_( &totalnz, alpha, a->a, &one );
1887   PLogFlops(totalnz);
1888   return 0;
1889 }
1890 
1891 #undef __FUNC__
1892 #define __FUNC__ "MatGetValues_SeqBAIJ"
1893 int MatGetValues_SeqBAIJ(Mat A,int m,int *im,int n,int *in,Scalar *v)
1894 {
1895   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
1896   int        *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
1897   int        *ai = a->i, *ailen = a->ilen;
1898   int        brow,bcol,ridx,cidx,bs=a->bs,bs2=a->bs2;
1899   Scalar     *ap, *aa = a->a, zero = 0.0;
1900 
1901   for ( k=0; k<m; k++ ) { /* loop over rows */
1902     row  = im[k]; brow = row/bs;
1903     if (row < 0) SETERRQ(1,0,"Negative row");
1904     if (row >= a->m) SETERRQ(1,0,"Row too large");
1905     rp   = aj + ai[brow] ; ap = aa + bs2*ai[brow] ;
1906     nrow = ailen[brow];
1907     for ( l=0; l<n; l++ ) { /* loop over columns */
1908       if (in[l] < 0) SETERRQ(1,0,"Negative column");
1909       if (in[l] >= a->n) SETERRQ(1,0,"Column too large");
1910       col  = in[l] ;
1911       bcol = col/bs;
1912       cidx = col%bs;
1913       ridx = row%bs;
1914       high = nrow;
1915       low  = 0; /* assume unsorted */
1916       while (high-low > 5) {
1917         t = (low+high)/2;
1918         if (rp[t] > bcol) high = t;
1919         else             low  = t;
1920       }
1921       for ( i=low; i<high; i++ ) {
1922         if (rp[i] > bcol) break;
1923         if (rp[i] == bcol) {
1924           *v++ = ap[bs2*i+bs*cidx+ridx];
1925           goto finished;
1926         }
1927       }
1928       *v++ = zero;
1929       finished:;
1930     }
1931   }
1932   return 0;
1933 }
1934 
1935 #undef __FUNC__
1936 #define __FUNC__ "MatGetBlockSize_SeqBAIJ"
1937 int MatGetBlockSize_SeqBAIJ(Mat mat, int *bs)
1938 {
1939   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *) mat->data;
1940   *bs = baij->bs;
1941   return 0;
1942 }
1943 
1944 /* idx should be of length atlease bs */
1945 #undef __FUNC__
1946 #define __FUNC__ "MatZeroRows_SeqBAIJ_Check_Block"
1947 static int MatZeroRows_SeqBAIJ_Check_Block(int *idx, int bs, PetscTruth *flg)
1948 {
1949   int i,row;
1950   row = idx[0];
1951   if (row%bs!=0) { *flg = PETSC_FALSE; return 0; }
1952 
1953   for ( i=1; i<bs; i++ ) {
1954     if (row+i != idx[i]) { *flg = PETSC_FALSE; return 0; }
1955   }
1956   *flg = PETSC_TRUE;
1957   return 0;
1958 }
1959 
1960 #undef __FUNC__
1961 #define __FUNC__ "MatZeroRows_SeqBAIJ"
1962 int MatZeroRows_SeqBAIJ(Mat A,IS is, Scalar *diag)
1963 {
1964   Mat_SeqBAIJ *baij=(Mat_SeqBAIJ*)A->data;
1965   IS          is_local;
1966   int         ierr,i,j,count,m=baij->m,is_n,*is_idx,*rows,bs=baij->bs,bs2=baij->bs2;
1967   PetscTruth  flg;
1968   Scalar      zero = 0.0,*aa;
1969 
1970   /* Make a copy of the IS and  sort it */
1971   ierr = ISGetSize(is,&is_n);CHKERRQ(ierr);
1972   ierr = ISGetIndices(is,&is_idx);CHKERRQ(ierr);
1973   ierr = ISCreateGeneral(A->comm,is_n,is_idx,&is_local); CHKERRQ(ierr);
1974   ierr = ISSort(is_local); CHKERRQ(ierr);
1975   ierr = ISGetIndices(is_local,&rows); CHKERRQ(ierr);
1976 
1977   i = 0;
1978   while (i < is_n) {
1979     if (rows[i]<0 || rows[i]>m) SETERRQ(1,0,"row out of range");
1980     flg = PETSC_FALSE;
1981     if (i+bs <= is_n) {ierr = MatZeroRows_SeqBAIJ_Check_Block(rows+i,bs,&flg); CHKERRQ(ierr); }
1982     if (flg) { /* There exists a block of rows to be Zerowed */
1983       baij->ilen[rows[i]/bs] = 0;
1984       i += bs;
1985     } else { /* Zero out only the requested row */
1986       count = (baij->i[rows[i]/bs +1] - baij->i[rows[i]/bs])*bs;
1987       aa    = baij->a + baij->i[rows[i]/bs]*bs2 + (rows[i]%bs);
1988       for ( j=0; j<count; j++ ) {
1989         aa[0] = zero;
1990         aa+=bs;
1991       }
1992       i++;
1993     }
1994   }
1995   if (diag) {
1996     for ( j=0; j<is_n; j++ ) {
1997       ierr = (*A->ops.setvalues)(A,1,rows+j,1,rows+j,diag,INSERT_VALUES);CHKERRQ(ierr);
1998     }
1999   }
2000   ierr = ISRestoreIndices(is,&is_idx); CHKERRQ(ierr);
2001   ierr = ISRestoreIndices(is_local,&rows); CHKERRQ(ierr);
2002   ierr = ISDestroy(is_local); CHKERRQ(ierr);
2003   ierr = MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
2004 
2005   return 0;
2006 }
2007 
2008 #undef __FUNC__
2009 #define __FUNC__ "MatPrintHelp_SeqBAIJ"
2010 int MatPrintHelp_SeqBAIJ(Mat A)
2011 {
2012   static int called = 0;
2013   MPI_Comm   comm = A->comm;
2014 
2015   if (called) return 0; else called = 1;
2016   PetscPrintf(comm," Options for MATSEQBAIJ and MATMPIBAIJ matrix formats (the defaults):\n");
2017   PetscPrintf(comm,"  -mat_block_size <block_size>\n");
2018   return 0;
2019 }
2020 
2021 /* -------------------------------------------------------------------*/
2022 static struct _MatOps MatOps = {MatSetValues_SeqBAIJ,
2023        MatGetRow_SeqBAIJ,MatRestoreRow_SeqBAIJ,
2024        MatMult_SeqBAIJ_N,MatMultAdd_SeqBAIJ_N,
2025        MatMultTrans_SeqBAIJ,MatMultTransAdd_SeqBAIJ,
2026        MatSolve_SeqBAIJ_N,0,
2027        0,0,
2028        MatLUFactor_SeqBAIJ,0,
2029        0,
2030        MatTranspose_SeqBAIJ,
2031        MatGetInfo_SeqBAIJ,MatEqual_SeqBAIJ,
2032        MatGetDiagonal_SeqBAIJ,MatDiagonalScale_SeqBAIJ,MatNorm_SeqBAIJ,
2033        0,MatAssemblyEnd_SeqBAIJ,
2034        0,
2035        MatSetOption_SeqBAIJ,MatZeroEntries_SeqBAIJ,MatZeroRows_SeqBAIJ,
2036        MatLUFactorSymbolic_SeqBAIJ,MatLUFactorNumeric_SeqBAIJ_N,0,0,
2037        MatGetSize_SeqBAIJ,MatGetSize_SeqBAIJ,MatGetOwnershipRange_SeqBAIJ,
2038        MatILUFactorSymbolic_SeqBAIJ,0,
2039        0,0,
2040        MatConvertSameType_SeqBAIJ,0,0,
2041        MatILUFactor_SeqBAIJ,0,0,
2042        MatGetSubMatrices_SeqBAIJ,MatIncreaseOverlap_SeqBAIJ,
2043        MatGetValues_SeqBAIJ,0,
2044        MatPrintHelp_SeqBAIJ,MatScale_SeqBAIJ,
2045        0,0,0,MatGetBlockSize_SeqBAIJ,
2046        MatGetRowIJ_SeqBAIJ,
2047        MatRestoreRowIJ_SeqBAIJ,
2048        0,0,0,0,0,0,
2049        MatSetValuesBlocked_SeqBAIJ};
2050 
2051 #undef __FUNC__
2052 #define __FUNC__ "MatCreateSeqBAIJ"
2053 /*@C
2054    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
2055    compressed row) format.  For good matrix assembly performance the
2056    user should preallocate the matrix storage by setting the parameter nz
2057    (or the array nzz).  By setting these parameters accurately, performance
2058    during matrix assembly can be increased by more than a factor of 50.
2059 
2060    Input Parameters:
2061 .  comm - MPI communicator, set to PETSC_COMM_SELF
2062 .  bs - size of block
2063 .  m - number of rows
2064 .  n - number of columns
2065 .  nz - number of block nonzeros per block row (same for all rows)
2066 .  nzz - array containing the number of block nonzeros in the various block rows
2067          (possibly different for each block row) or PETSC_NULL
2068 
2069    Output Parameter:
2070 .  A - the matrix
2071 
2072    Options Database Keys:
2073 $    -mat_no_unroll - uses code that does not unroll the loops in the
2074 $                     block calculations (much slower)
2075 $    -mat_block_size - size of the blocks to use
2076 
2077    Notes:
2078    The block AIJ format is fully compatible with standard Fortran 77
2079    storage.  That is, the stored row and column indices can begin at
2080    either one (as in Fortran) or zero.  See the users' manual for details.
2081 
2082    Specify the preallocated storage with either nz or nnz (not both).
2083    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
2084    allocation.  For additional details, see the users manual chapter on
2085    matrices.
2086 
2087 .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues()
2088 @*/
2089 int MatCreateSeqBAIJ(MPI_Comm comm,int bs,int m,int n,int nz,int *nnz, Mat *A)
2090 {
2091   Mat         B;
2092   Mat_SeqBAIJ *b;
2093   int         i,len,ierr,flg,mbs=m/bs,nbs=n/bs,bs2=bs*bs,size;
2094 
2095   MPI_Comm_size(comm,&size);
2096   if (size > 1) SETERRQ(1,0,"Comm must be of size 1");
2097 
2098   ierr = OptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,&flg);CHKERRQ(ierr);
2099 
2100   if (mbs*bs!=m || nbs*bs!=n)
2101     SETERRQ(1,0,"Number rows, cols must be divisible by blocksize");
2102 
2103   *A = 0;
2104   PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATSEQBAIJ,comm,MatDestroy,MatView);
2105   PLogObjectCreate(B);
2106   B->data = (void *) (b = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(b);
2107   PetscMemzero(b,sizeof(Mat_SeqBAIJ));
2108   PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));
2109   ierr = OptionsHasName(PETSC_NULL,"-mat_no_unroll",&flg); CHKERRQ(ierr);
2110   if (!flg) {
2111     switch (bs) {
2112     case 1:
2113       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2114       B->ops.solve           = MatSolve_SeqBAIJ_1;
2115       B->ops.mult            = MatMult_SeqBAIJ_1;
2116       B->ops.multadd         = MatMultAdd_SeqBAIJ_1;
2117       break;
2118     case 2:
2119       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2;
2120       B->ops.solve           = MatSolve_SeqBAIJ_2;
2121       B->ops.mult            = MatMult_SeqBAIJ_2;
2122       B->ops.multadd         = MatMultAdd_SeqBAIJ_2;
2123       break;
2124     case 3:
2125       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3;
2126       B->ops.solve           = MatSolve_SeqBAIJ_3;
2127       B->ops.mult            = MatMult_SeqBAIJ_3;
2128       B->ops.multadd         = MatMultAdd_SeqBAIJ_3;
2129       break;
2130     case 4:
2131       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4;
2132       B->ops.solve           = MatSolve_SeqBAIJ_4;
2133       B->ops.mult            = MatMult_SeqBAIJ_4;
2134       B->ops.multadd         = MatMultAdd_SeqBAIJ_4;
2135       break;
2136     case 5:
2137       B->ops.lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5;
2138       B->ops.solve           = MatSolve_SeqBAIJ_5;
2139       B->ops.mult            = MatMult_SeqBAIJ_5;
2140       B->ops.multadd         = MatMultAdd_SeqBAIJ_5;
2141       break;
2142     case 7:
2143       B->ops.mult            = MatMult_SeqBAIJ_7;
2144       B->ops.solve           = MatSolve_SeqBAIJ_7;
2145       B->ops.multadd         = MatMultAdd_SeqBAIJ_7;
2146       break;
2147     }
2148   }
2149   B->destroy          = MatDestroy_SeqBAIJ;
2150   B->view             = MatView_SeqBAIJ;
2151   B->factor           = 0;
2152   B->lupivotthreshold = 1.0;
2153   B->mapping          = 0;
2154   b->row              = 0;
2155   b->col              = 0;
2156   b->reallocs         = 0;
2157 
2158   b->m       = m; B->m = m; B->M = m;
2159   b->n       = n; B->n = n; B->N = n;
2160   b->mbs     = mbs;
2161   b->nbs     = nbs;
2162   b->imax    = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(b->imax);
2163   if (nnz == PETSC_NULL) {
2164     if (nz == PETSC_DEFAULT) nz = 5;
2165     else if (nz <= 0)        nz = 1;
2166     for ( i=0; i<mbs; i++ ) b->imax[i] = nz;
2167     nz = nz*mbs;
2168   }
2169   else {
2170     nz = 0;
2171     for ( i=0; i<mbs; i++ ) {b->imax[i] = nnz[i]; nz += nnz[i];}
2172   }
2173 
2174   /* allocate the matrix space */
2175   len   = nz*sizeof(int) + nz*bs2*sizeof(Scalar) + (b->m+1)*sizeof(int);
2176   b->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
2177   PetscMemzero(b->a,nz*bs2*sizeof(Scalar));
2178   b->j  = (int *) (b->a + nz*bs2);
2179   PetscMemzero(b->j,nz*sizeof(int));
2180   b->i  = b->j + nz;
2181   b->singlemalloc = 1;
2182 
2183   b->i[0] = 0;
2184   for (i=1; i<mbs+1; i++) {
2185     b->i[i] = b->i[i-1] + b->imax[i-1];
2186   }
2187 
2188   /* b->ilen will count nonzeros in each block row so far. */
2189   b->ilen = (int *) PetscMalloc((mbs+1)*sizeof(int));
2190   PLogObjectMemory(B,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2191   for ( i=0; i<mbs; i++ ) { b->ilen[i] = 0;}
2192 
2193   b->bs               = bs;
2194   b->bs2              = bs2;
2195   b->mbs              = mbs;
2196   b->nz               = 0;
2197   b->maxnz            = nz*bs2;
2198   b->sorted           = 0;
2199   b->roworiented      = 1;
2200   b->nonew            = 0;
2201   b->diag             = 0;
2202   b->solve_work       = 0;
2203   b->mult_work        = 0;
2204   b->spptr            = 0;
2205   B->info.nz_unneeded = (double)b->maxnz;
2206 
2207   *A = B;
2208   ierr = OptionsHasName(PETSC_NULL,"-help", &flg); CHKERRQ(ierr);
2209   if (flg) {ierr = MatPrintHelp(B); CHKERRQ(ierr); }
2210   return 0;
2211 }
2212 
2213 #undef __FUNC__
2214 #define __FUNC__ "MatConvertSameType_SeqBAIJ"
2215 int MatConvertSameType_SeqBAIJ(Mat A,Mat *B,int cpvalues)
2216 {
2217   Mat         C;
2218   Mat_SeqBAIJ *c,*a = (Mat_SeqBAIJ *) A->data;
2219   int         i,len, mbs = a->mbs,nz = a->nz,bs2 =a->bs2;
2220 
2221   if (a->i[mbs] != nz) SETERRQ(1,0,"Corrupt matrix");
2222 
2223   *B = 0;
2224   PetscHeaderCreate(C,_p_Mat,MAT_COOKIE,MATSEQBAIJ,A->comm,MatDestroy,MatView);
2225   PLogObjectCreate(C);
2226   C->data       = (void *) (c = PetscNew(Mat_SeqBAIJ)); CHKPTRQ(c);
2227   PetscMemcpy(&C->ops,&A->ops,sizeof(struct _MatOps));
2228   C->destroy    = MatDestroy_SeqBAIJ;
2229   C->view       = MatView_SeqBAIJ;
2230   C->factor     = A->factor;
2231   c->row        = 0;
2232   c->col        = 0;
2233   C->assembled  = PETSC_TRUE;
2234 
2235   c->m = C->m   = a->m;
2236   c->n = C->n   = a->n;
2237   C->M          = a->m;
2238   C->N          = a->n;
2239 
2240   c->bs         = a->bs;
2241   c->bs2        = a->bs2;
2242   c->mbs        = a->mbs;
2243   c->nbs        = a->nbs;
2244 
2245   c->imax       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->imax);
2246   c->ilen       = (int *) PetscMalloc((mbs+1)*sizeof(int)); CHKPTRQ(c->ilen);
2247   for ( i=0; i<mbs; i++ ) {
2248     c->imax[i] = a->imax[i];
2249     c->ilen[i] = a->ilen[i];
2250   }
2251 
2252   /* allocate the matrix space */
2253   c->singlemalloc = 1;
2254   len   = (mbs+1)*sizeof(int) + nz*(bs2*sizeof(Scalar) + sizeof(int));
2255   c->a  = (Scalar *) PetscMalloc( len ); CHKPTRQ(c->a);
2256   c->j  = (int *) (c->a + nz*bs2);
2257   c->i  = c->j + nz;
2258   PetscMemcpy(c->i,a->i,(mbs+1)*sizeof(int));
2259   if (mbs > 0) {
2260     PetscMemcpy(c->j,a->j,nz*sizeof(int));
2261     if (cpvalues == COPY_VALUES) {
2262       PetscMemcpy(c->a,a->a,bs2*nz*sizeof(Scalar));
2263     }
2264   }
2265 
2266   PLogObjectMemory(C,len+2*(mbs+1)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_SeqBAIJ));
2267   c->sorted      = a->sorted;
2268   c->roworiented = a->roworiented;
2269   c->nonew       = a->nonew;
2270 
2271   if (a->diag) {
2272     c->diag = (int *) PetscMalloc( (mbs+1)*sizeof(int) ); CHKPTRQ(c->diag);
2273     PLogObjectMemory(C,(mbs+1)*sizeof(int));
2274     for ( i=0; i<mbs; i++ ) {
2275       c->diag[i] = a->diag[i];
2276     }
2277   }
2278   else c->diag          = 0;
2279   c->nz                 = a->nz;
2280   c->maxnz              = a->maxnz;
2281   c->solve_work         = 0;
2282   c->spptr              = 0;      /* Dangerous -I'm throwing away a->spptr */
2283   c->mult_work          = 0;
2284   *B = C;
2285   return 0;
2286 }
2287 
2288 #undef __FUNC__
2289 #define __FUNC__ "MatLoad_SeqBAIJ"
2290 int MatLoad_SeqBAIJ(Viewer viewer,MatType type,Mat *A)
2291 {
2292   Mat_SeqBAIJ  *a;
2293   Mat          B;
2294   int          i,nz,ierr,fd,header[4],size,*rowlengths=0,M,N,bs=1,flg;
2295   int          *mask,mbs,*jj,j,rowcount,nzcount,k,*browlengths,maskcount;
2296   int          kmax,jcount,block,idx,point,nzcountb,extra_rows;
2297   int          *masked, nmask,tmp,bs2,ishift;
2298   Scalar       *aa;
2299   MPI_Comm     comm = ((PetscObject) viewer)->comm;
2300 
2301   ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
2302   bs2  = bs*bs;
2303 
2304   MPI_Comm_size(comm,&size);
2305   if (size > 1) SETERRQ(1,0,"view must have one processor");
2306   ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
2307   ierr = PetscBinaryRead(fd,header,4,PETSC_INT); CHKERRQ(ierr);
2308   if (header[0] != MAT_COOKIE) SETERRQ(1,0,"not Mat object");
2309   M = header[1]; N = header[2]; nz = header[3];
2310 
2311   if (M != N) SETERRQ(1,0,"Can only do square matrices");
2312 
2313   /*
2314      This code adds extra rows to make sure the number of rows is
2315     divisible by the blocksize
2316   */
2317   mbs        = M/bs;
2318   extra_rows = bs - M + bs*(mbs);
2319   if (extra_rows == bs) extra_rows = 0;
2320   else                  mbs++;
2321   if (extra_rows) {
2322     PLogInfo(0,"MatLoad_SeqBAIJ:Padding loaded matrix to match blocksize\n");
2323   }
2324 
2325   /* read in row lengths */
2326   rowlengths = (int*) PetscMalloc((M+extra_rows)*sizeof(int));CHKPTRQ(rowlengths);
2327   ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
2328   for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
2329 
2330   /* read in column indices */
2331   jj = (int*) PetscMalloc( (nz+extra_rows)*sizeof(int) ); CHKPTRQ(jj);
2332   ierr = PetscBinaryRead(fd,jj,nz,PETSC_INT); CHKERRQ(ierr);
2333   for ( i=0; i<extra_rows; i++ ) jj[nz+i] = M+i;
2334 
2335   /* loop over row lengths determining block row lengths */
2336   browlengths = (int *) PetscMalloc(mbs*sizeof(int));CHKPTRQ(browlengths);
2337   PetscMemzero(browlengths,mbs*sizeof(int));
2338   mask   = (int *) PetscMalloc( 2*mbs*sizeof(int) ); CHKPTRQ(mask);
2339   PetscMemzero(mask,mbs*sizeof(int));
2340   masked = mask + mbs;
2341   rowcount = 0; nzcount = 0;
2342   for ( i=0; i<mbs; i++ ) {
2343     nmask = 0;
2344     for ( j=0; j<bs; j++ ) {
2345       kmax = rowlengths[rowcount];
2346       for ( k=0; k<kmax; k++ ) {
2347         tmp = jj[nzcount++]/bs;
2348         if (!mask[tmp]) {masked[nmask++] = tmp; mask[tmp] = 1;}
2349       }
2350       rowcount++;
2351     }
2352     browlengths[i] += nmask;
2353     /* zero out the mask elements we set */
2354     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2355   }
2356 
2357   /* create our matrix */
2358   ierr = MatCreateSeqBAIJ(comm,bs,M+extra_rows,N+extra_rows,0,browlengths,A);CHKERRQ(ierr);
2359   B = *A;
2360   a = (Mat_SeqBAIJ *) B->data;
2361 
2362   /* set matrix "i" values */
2363   a->i[0] = 0;
2364   for ( i=1; i<= mbs; i++ ) {
2365     a->i[i]      = a->i[i-1] + browlengths[i-1];
2366     a->ilen[i-1] = browlengths[i-1];
2367   }
2368   a->nz         = 0;
2369   for ( i=0; i<mbs; i++ ) a->nz += browlengths[i];
2370 
2371   /* read in nonzero values */
2372   aa = (Scalar *) PetscMalloc((nz+extra_rows)*sizeof(Scalar));CHKPTRQ(aa);
2373   ierr = PetscBinaryRead(fd,aa,nz,PETSC_SCALAR); CHKERRQ(ierr);
2374   for ( i=0; i<extra_rows; i++ ) aa[nz+i] = 1.0;
2375 
2376   /* set "a" and "j" values into matrix */
2377   nzcount = 0; jcount = 0;
2378   for ( i=0; i<mbs; i++ ) {
2379     nzcountb = nzcount;
2380     nmask    = 0;
2381     for ( j=0; j<bs; j++ ) {
2382       kmax = rowlengths[i*bs+j];
2383       for ( k=0; k<kmax; k++ ) {
2384         tmp = jj[nzcount++]/bs;
2385 	if (!mask[tmp]) { masked[nmask++] = tmp; mask[tmp] = 1;}
2386       }
2387       rowcount++;
2388     }
2389     /* sort the masked values */
2390     PetscSortInt(nmask,masked);
2391 
2392     /* set "j" values into matrix */
2393     maskcount = 1;
2394     for ( j=0; j<nmask; j++ ) {
2395       a->j[jcount++]  = masked[j];
2396       mask[masked[j]] = maskcount++;
2397     }
2398     /* set "a" values into matrix */
2399     ishift = bs2*a->i[i];
2400     for ( j=0; j<bs; j++ ) {
2401       kmax = rowlengths[i*bs+j];
2402       for ( k=0; k<kmax; k++ ) {
2403         tmp    = jj[nzcountb]/bs ;
2404         block  = mask[tmp] - 1;
2405         point  = jj[nzcountb] - bs*tmp;
2406         idx    = ishift + bs2*block + j + bs*point;
2407         a->a[idx] = aa[nzcountb++];
2408       }
2409     }
2410     /* zero out the mask elements we set */
2411     for ( j=0; j<nmask; j++ ) mask[masked[j]] = 0;
2412   }
2413   if (jcount != a->nz) SETERRQ(1,0,"Error bad binary matrix");
2414 
2415   PetscFree(rowlengths);
2416   PetscFree(browlengths);
2417   PetscFree(aa);
2418   PetscFree(jj);
2419   PetscFree(mask);
2420 
2421   B->assembled = PETSC_TRUE;
2422 
2423   ierr = MatView_Private(B); CHKERRQ(ierr);
2424   return 0;
2425 }
2426 
2427 
2428 
2429