xref: /petsc/src/mat/impls/baij/mpi/mpibaij.c (revision 76be9ce4a233aaa47cda2bc7f5a27cd7faabecaa)
1 #ifdef PETSC_RCS_HEADER
2 static char vcid[] = "$Id: mpibaij.c,v 1.98 1998/01/12 21:15:42 balay Exp balay $";
3 #endif
4 
5 #include "pinclude/pviewer.h"
6 #include "src/mat/impls/baij/mpi/mpibaij.h"
7 #include "src/vec/vecimpl.h"
8 
9 
10 extern int MatSetUpMultiply_MPIBAIJ(Mat);
11 extern int DisAssemble_MPIBAIJ(Mat);
12 extern int MatIncreaseOverlap_MPIBAIJ(Mat,int,IS *,int);
13 extern int MatGetSubMatrices_MPIBAIJ(Mat,int,IS *,IS *,MatGetSubMatrixCall,Mat **);
14 
15 /*
16      Local utility routine that creates a mapping from the global column
17    number to the local number in the off-diagonal part of the local
   storage of the matrix.  This is done in a non-scalable way since the
19    length of colmap equals the global matrix length.
20 */
21 #undef __FUNC__
22 #define __FUNC__ "CreateColmap_MPIBAIJ_Private"
23 static int CreateColmap_MPIBAIJ_Private(Mat mat)
24 {
25   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
26   Mat_SeqBAIJ *B = (Mat_SeqBAIJ*) baij->B->data;
27   int         nbs = B->nbs,i,bs=B->bs;;
28 
29   PetscFunctionBegin;
30   baij->colmap = (int *) PetscMalloc((baij->Nbs+1)*sizeof(int));CHKPTRQ(baij->colmap);
31   PLogObjectMemory(mat,baij->Nbs*sizeof(int));
32   PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));
33   for ( i=0; i<nbs; i++ ) baij->colmap[baij->garray[i]] = i*bs+1;
34   PetscFunctionReturn(0);
35 }
36 
/* Number of extra block entries grabbed each time a row runs out of space */
#define CHUNKSIZE  10

/*
   MatSetValues_SeqBAIJ_A_Private - Macro that inserts/adds the single scalar
   entry (row,col) = value into the local "diagonal" SeqBAIJ matrix A; row and
   col are local point (not block) indices.  It uses the variables declared in
   MatSetValues_MPIBAIJ (a,ai,aj,aa,aimax,ailen,bs,bs2,rp,ap,bap,brow,bcol,
   ridx,cidx,low,high,t,_i,ii,nrow,rmax,N).  If the block is absent and
   a->nonew permits, a new block is inserted, enlarging the storage by
   CHUNKSIZE blocks when the row is full.
*/
#define  MatSetValues_SeqBAIJ_A_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  /* block row containing this point row */ \
    rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; /* column indices / values of block row */ \
    rmax = aimax[brow]; nrow = ailen[brow]; \
      bcol = col/bs; /* target block column */ \
      ridx = row % bs; cidx = col % bs; /* position inside the bs x bs block */ \
      low = 0; high = nrow; \
      /* binary search narrows [low,high); finish with a linear scan below */ \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          /* block already present: update element (ridx,cidx), column-major */ \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto a_noinsert; \
        } \
      } \
      /* entry would create a new nonzero block */ \
      if (a->nonew == 1) goto a_noinsert; \
      else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space (values, j and i arrays in one chunk) */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(a->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots; rows after brow shift by CHUNKSIZE */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = ai[ii];} \
        for ( ii=brow+1; ii<a->mbs+1; ii++ ) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
        PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
                    aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(a->a);  \
        if (!a->singlemalloc) {PetscFree(a->i);PetscFree(a->j);} \
        aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
        a->singlemalloc = 1; \
 \
        rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
        rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        a->maxnz += bs2*CHUNKSIZE; \
        a->reallocs++; \
        a->nz++; /* NOTE(review): nz is bumped only on reallocation, not per inserted block -- confirm intended */ \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      a_noinsert:; \
    ailen[brow] = nrow; \
}
113 
/*
   MatSetValues_SeqBAIJ_B_Private - Same insertion logic as the A variant
   above, but operating on the local "off-diagonal" SeqBAIJ matrix B
   (variables b,bi,bj,ba,bimax,bilen from MatSetValues_MPIBAIJ).
*/
#define  MatSetValues_SeqBAIJ_B_Private(row,col,value,addv) \
{ \
 \
    brow = row/bs;  /* block row containing this point row */ \
    rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; /* column indices / values of block row */ \
    rmax = bimax[brow]; nrow = bilen[brow]; \
      bcol = col/bs; /* target block column */ \
      ridx = row % bs; cidx = col % bs; /* position inside the bs x bs block */ \
      low = 0; high = nrow; \
      /* binary search narrows [low,high); finish with a linear scan below */ \
      while (high-low > 3) { \
        t = (low+high)/2; \
        if (rp[t] > bcol) high = t; \
        else              low  = t; \
      } \
      for ( _i=low; _i<high; _i++ ) { \
        if (rp[_i] > bcol) break; \
        if (rp[_i] == bcol) { \
          /* block already present: update element (ridx,cidx), column-major */ \
          bap  = ap +  bs2*_i + bs*cidx + ridx; \
          if (addv == ADD_VALUES) *bap += value;  \
          else                    *bap  = value;  \
          goto b_noinsert; \
        } \
      } \
      /* entry would create a new nonzero block */ \
      if (b->nonew == 1) goto b_noinsert; \
      else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero into matrix"); \
      if (nrow >= rmax) { \
        /* there is no extra room in row, therefore enlarge */ \
        int    new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
        Scalar *new_a; \
 \
        if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Inserting a new nonzero in the matrix"); \
 \
        /* malloc new storage space (values, j and i arrays in one chunk) */ \
        len     = new_nz*(sizeof(int)+bs2*sizeof(Scalar))+(b->mbs+1)*sizeof(int); \
        new_a   = (Scalar *) PetscMalloc( len ); CHKPTRQ(new_a); \
        new_j   = (int *) (new_a + bs2*new_nz); \
        new_i   = new_j + new_nz; \
 \
        /* copy over old data into new slots; rows after brow shift by CHUNKSIZE */ \
        for ( ii=0; ii<brow+1; ii++ ) {new_i[ii] = bi[ii];} \
        for ( ii=brow+1; ii<b->mbs+1; ii++ ) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
        PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int)); \
        len = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
        PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow, \
                                                           len*sizeof(int)); \
        PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(Scalar)); \
        PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(Scalar)); \
        PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
                    ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(Scalar));  \
        /* free up old matrix storage */ \
        PetscFree(b->a);  \
        if (!b->singlemalloc) {PetscFree(b->i);PetscFree(b->j);} \
        ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
        b->singlemalloc = 1; \
 \
        rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
        rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
        PLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(Scalar))); \
        b->maxnz += bs2*CHUNKSIZE; \
        b->reallocs++; \
        b->nz++; /* NOTE(review): nz is bumped only on reallocation, not per inserted block -- confirm intended */ \
      } \
      N = nrow++ - 1;  \
      /* shift up all the later entries in this row */ \
      for ( ii=N; ii>=_i; ii-- ) { \
        rp[ii+1] = rp[ii]; \
        PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(Scalar)); \
      } \
      if (N>=_i) PetscMemzero(ap+bs2*_i,bs2*sizeof(Scalar));  \
      rp[_i]                      = bcol;  \
      ap[bs2*_i + bs*cidx + ridx] = value;  \
      b_noinsert:; \
    bilen[brow] = nrow; \
}
188 
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ"
/*
   MatSetValues_MPIBAIJ - Inserts or adds an m x n logically-dense array of
   values v at global point rows im[] / columns in[].  Locally-owned entries
   go straight into the diagonal (A) or off-diagonal (B) SeqBAIJ part via
   the macros above; off-process entries are stashed for communication at
   assembly time.  Returns 0 on success.
*/
int MatSetValues_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  Scalar      value;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
  int         cend_orig=baij->cend_bs,bs=baij->bs;

  /* Some Variables required in the macro */
  Mat         A = baij->A;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) (A)->data;
  int         *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
  Scalar      *aa=a->a;

  Mat         B = baij->B;
  Mat_SeqBAIJ *b = (Mat_SeqBAIJ *) (B)->data;
  int         *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
  Scalar      *ba=b->a;

  int         *rp,ii,nrow,_i,rmax,N,brow,bcol;
  int         low,high,t,ridx,cidx,bs2=a->bs2;
  Scalar      *ap,*bap;

  PetscFunctionBegin;
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      /* row is locally owned */
      row = im[i] - rstart_orig;
      for ( j=0; j<n; j++ ) {
        if (in[j] >= cstart_orig && in[j] < cend_orig){
          /* column falls in the diagonal block A */
          col = in[j] - cstart_orig;
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_A_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
#if defined(USE_PETSC_BOPT_g)
        else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
        else if (in[j] >= baij->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Col too large");}
#endif
        else {
          /* column belongs to the off-diagonal block B */
          if (mat->was_assembled) {
            if (!baij->colmap) {
              ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
            }
            /* colmap stores 1-based offsets; 0 means "block not local" */
            col = baij->colmap[in[j]/bs] - 1 + in[j]%bs;
            if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
              /* a genuinely new off-process block: fall back to unassembled form */
              ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
              col =  in[j];
              /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
              B = baij->B;
              b = (Mat_SeqBAIJ *) (B)->data;
              bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
              ba=b->a;
            }
          } else col = in[j];
          if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
          MatSetValues_SeqBAIJ_B_Private(row,col,value,addv);
          /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
        }
      }
    } else {
      /* row owned by another process: stash for assembly-time communication */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
          for ( j=0; j<n; j++ ) {
            ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
270 
271 extern int MatSetValuesBlocked_SeqBAIJ(Mat,int,int*,int,int*,Scalar*,InsertMode);
272 #undef __FUNC__
273 #define __FUNC__ "MatSetValuesBlocked_MPIBAIJ"
274 int MatSetValuesBlocked_MPIBAIJ(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
275 {
276   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
277   Scalar      *value,*barray=baij->barray;
278   int         ierr,i,j,ii,jj,row,col,k,l;
279   int         roworiented = baij->roworiented,rstart=baij->rstart ;
280   int         rend=baij->rend,cstart=baij->cstart,stepval;
281   int         cend=baij->cend,bs=baij->bs,bs2=baij->bs2;
282 
283   if(!barray) {
284     baij->barray = barray = (Scalar*) PetscMalloc(bs2*sizeof(Scalar)); CHKPTRQ(barray);
285   }
286 
287   if (roworiented) {
288     stepval = (n-1)*bs;
289   } else {
290     stepval = (m-1)*bs;
291   }
292   for ( i=0; i<m; i++ ) {
293 #if defined(USE_PETSC_BOPT_g)
294     if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
295     if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
296 #endif
297     if (im[i] >= rstart && im[i] < rend) {
298       row = im[i] - rstart;
299       for ( j=0; j<n; j++ ) {
300         /* If NumCol = 1 then a copy is not required */
301         if ((roworiented) && (n == 1)) {
302           barray = v + i*bs2;
303         } else if((!roworiented) && (m == 1)) {
304           barray = v + j*bs2;
305         } else { /* Here a copy is required */
306           if (roworiented) {
307             value = v + i*(stepval+bs)*bs + j*bs;
308           } else {
309             value = v + j*(stepval+bs)*bs + i*bs;
310           }
311           for ( ii=0; ii<bs; ii++,value+=stepval ) {
312             for (jj=0; jj<bs; jj++ ) {
313               *barray++  = *value++;
314             }
315           }
316           barray -=bs2;
317         }
318 
319         if (in[j] >= cstart && in[j] < cend){
320           col  = in[j] - cstart;
321           ierr = MatSetValuesBlocked_SeqBAIJ(baij->A,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
322         }
323 #if defined(USE_PETSC_BOPT_g)
324         else if (in[j] < 0) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");}
325         else if (in[j] >= baij->Nbs) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");}
326 #endif
327         else {
328           if (mat->was_assembled) {
329             if (!baij->colmap) {
330               ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
331             }
332 
333 #if defined(USE_PETSC_BOPT_g)
334             if ((baij->colmap[in[j]] - 1) % bs) {SETERRQ(PETSC_ERR_PLIB,0,"Incorrect colmap");}
335 #endif
336             col = (baij->colmap[in[j]] - 1)/bs;
337             if (col < 0 && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
338               ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
339               col =  in[j];
340             }
341           }
342           else col = in[j];
343           ierr = MatSetValuesBlocked_SeqBAIJ(baij->B,1,&row,1,&col,barray,addv);CHKERRQ(ierr);
344         }
345       }
346     } else {
347       if (!baij->donotstash) {
348         if (roworiented ) {
349           row   = im[i]*bs;
350           value = v + i*(stepval+bs)*bs;
351           for ( j=0; j<bs; j++,row++ ) {
352             for ( k=0; k<n; k++ ) {
353               for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
354                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
355               }
356             }
357           }
358         } else {
359           for ( j=0; j<n; j++ ) {
360             value = v + j*(stepval+bs)*bs + i*bs;
361             col   = in[j]*bs;
362             for ( k=0; k<bs; k++,col++,value+=stepval) {
363               for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
364                 ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
365               }
366             }
367           }
368         }
369       }
370     }
371   }
372   PetscFunctionReturn(0);
373 }
374 #include <math.h>
375 #define HASH_KEY 0.6180339887
376 #define HASH1(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1)))
377 
#undef __FUNC__
#define __FUNC__ "MatSetValues_MPIBAIJ_HT"
/*
   MatSetValues_MPIBAIJ_HT - Hash-table variant of MatSetValues_MPIBAIJ.
   The nonzero pattern must already exist in the table built by
   MatCreateHashTable_MPIBAIJ_Private(); each (block row, block column) key
   is located by multiplicative hashing with linear probing, and the value
   is written directly into the stored block.  Off-process rows are stashed.
   It is an error if (row,col) is not present in the table.
*/
int MatSetValues_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,row,col;
  int         roworiented = baij->roworiented,rstart_orig=baij->rstart_bs ;
  int         rend_orig=baij->rend_bs;

  int         h1,key,size=baij->ht_size,k,bs=baij->bs;
  int         * HT  = baij->ht;      /* keys */
  Scalar      ** HD = baij->hd,value; /* pointers to the bs2 value blocks */


  PetscFunctionBegin;

  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    if (im[i] >= rstart_orig && im[i] < rend_orig) {
      row = im[i];
      for ( j=0; j<n; j++ ) {
        col = in[j];
        if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
        /* Look up into the Hash Table */
        key = (row/bs)*baij->Nbs+(col/bs)+1; /* +1 so 0 can mean "empty slot" */
        h1  = HASH1(size,key);

        /* linear probing from h1 */
        for ( k=0; k<size; k++ ){
          if (HT[(h1+k)%size] == key) {
            /* blocks are stored column-major: element (row%bs, col%bs) */
            if (addv == ADD_VALUES) *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs) += value;
            else                    *(HD[(h1+k)%size]+ (col % bs)*bs + row % bs)  = value;
            break;
          }
        }
        if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
      }
    } else {
      /* off-process row: stash for assembly */
      if (roworiented && !baij->donotstash) {
        ierr = StashValues_Private(&baij->stash,im[i],n,in,v+i*n,addv);CHKERRQ(ierr);
      } else {
        if (!baij->donotstash) {
          row = im[i];
          for ( j=0; j<n; j++ ) {
            ierr = StashValues_Private(&baij->stash,row,1,in+j,v+i+j*m,addv);CHKERRQ(ierr);
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
432 
#undef __FUNC__
#define __FUNC__ "MatSetValuesBlocked_MPIBAIJ_HT"
/*
   MatSetValuesBlocked_MPIBAIJ_HT - Hash-table variant of
   MatSetValuesBlocked_MPIBAIJ: im[]/in[] are global BLOCK indices and v
   holds m x n blocks of size bs x bs.  Each block's location is found in
   the precomputed hash table and the whole bs2 block is copied/added in
   place; off-process block rows are stashed point-by-point.  It is an
   error if a requested block is not present in the table.
*/
int MatSetValuesBlocked_MPIBAIJ_HT(Mat mat,int m,int *im,int n,int *in,Scalar *v,InsertMode addv)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr,i,j,ii,jj,row,col,k,l;
  int         roworiented = baij->roworiented,rstart=baij->rstart ;
  int         rend=baij->rend,stepval,bs=baij->bs,bs2=baij->bs2;

  int         h1,key,size=baij->ht_size;
  int         * HT  = baij->ht;              /* keys */
  Scalar      ** HD = baij->hd,*value,*baij_a; /* pointers to value blocks */

  PetscFunctionBegin;

  /* distance (in Scalars) between consecutive rows of one block inside v */
  if (roworiented) {
    stepval = (n-1)*bs;
  } else {
    stepval = (m-1)*bs;
  }
  for ( i=0; i<m; i++ ) {
#if defined(USE_PETSC_BOPT_g)
    if (im[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
    if (im[i] >= baij->Mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
#endif
    if (im[i] >= rstart && im[i] < rend) {
      row = im[i];
      for ( j=0; j<n; j++ ) {
        col = in[j];

        /* Look up into the Hash Table */
        key = row*baij->Nbs+col+1; /* +1 so 0 can mean "empty slot" */
        h1  = HASH1(size,key);

        /* linear probing from h1 */
        for ( k=0; k<size; k++ ){
          if (HT[(h1+k)%size] == key) {
            baij_a = HD[(h1+k)%size];

            if (roworiented) {
              /* v block is row-major; destination block is column-major */
              value = v + i*(stepval+bs)*bs + j*bs;
              if (addv == ADD_VALUES) {
                for ( ii=0; ii<bs; ii++,value+=stepval,baij_a++ ) {
                  for ( jj=0; jj<bs2; jj+=bs ) {
                    baij_a[jj]  += *value++;
                  }
                }
              } else {
                for ( ii=0; ii<bs; ii++,value+=stepval,baij_a++ ) {
                  for ( jj=0; jj<bs2; jj+=bs ) {
                    baij_a[jj]  = *value++;
                  }
                }
              }
            } else {
              /* v block is column-major; copy column by column */
              value = v + j*(stepval+bs)*bs + i*bs;
              if (addv == ADD_VALUES) {
                for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
                  for ( jj=0; jj<bs; jj++ ) {
                    baij_a[jj]  += *value++;
                  }
                }
              } else {
                for ( ii=0; ii<bs; ii++,value+=stepval,baij_a+=bs ) {
                  for ( jj=0; jj<bs; jj++ ) {
                    baij_a[jj]  = *value++;
                  }
                }
              }
            }
            break;
          }
        }
        if ( k==size) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"(row,col) has no entry in the hash table");
      }
    } else {
      /* block row owned elsewhere: stash each point entry individually */
      if (!baij->donotstash) {
        if (roworiented ) {
          row   = im[i]*bs;
          value = v + i*(stepval+bs)*bs;
          for ( j=0; j<bs; j++,row++ ) {
            for ( k=0; k<n; k++ ) {
              for ( col=in[k]*bs,l=0; l<bs; l++,col++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        } else {
          for ( j=0; j<n; j++ ) {
            value = v + j*(stepval+bs)*bs + i*bs;
            col   = in[j]*bs;
            for ( k=0; k<bs; k++,col++,value+=stepval) {
              for ( row = im[i]*bs,l=0; l<bs; l++,row++) {
                ierr = StashValues_Private(&baij->stash,row,1,&col,value++,addv);CHKERRQ(ierr);
              }
            }
          }
        }
      }
    }
  }
  PetscFunctionReturn(0);
}
535 #undef __FUNC__
536 #define __FUNC__ "MatGetValues_MPIBAIJ"
537 int MatGetValues_MPIBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,Scalar *v)
538 {
539   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
540   int        bs=baij->bs,ierr,i,j, bsrstart = baij->rstart*bs, bsrend = baij->rend*bs;
541   int        bscstart = baij->cstart*bs, bscend = baij->cend*bs,row,col;
542 
543   for ( i=0; i<m; i++ ) {
544     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative row");
545     if (idxm[i] >= baij->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Row too large");
546     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
547       row = idxm[i] - bsrstart;
548       for ( j=0; j<n; j++ ) {
549         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Negative column");
550         if (idxn[j] >= baij->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Column too large");
551         if (idxn[j] >= bscstart && idxn[j] < bscend){
552           col = idxn[j] - bscstart;
553           ierr = MatGetValues(baij->A,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
554         } else {
555           if (!baij->colmap) {
556             ierr = CreateColmap_MPIBAIJ_Private(mat);CHKERRQ(ierr);
557           }
558           if((baij->colmap[idxn[j]/bs]-1 < 0) ||
559              (baij->garray[(baij->colmap[idxn[j]/bs]-1)/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
560           else {
561             col  = (baij->colmap[idxn[j]/bs]-1) + idxn[j]%bs;
562             ierr = MatGetValues(baij->B,1,&row,1,&col,v+i*n+j); CHKERRQ(ierr);
563           }
564         }
565       }
566     } else {
567       SETERRQ(PETSC_ERR_SUP,0,"Only local values currently supported");
568     }
569   }
570   PetscFunctionReturn(0);
571 }
572 
573 #undef __FUNC__
574 #define __FUNC__ "MatNorm_MPIBAIJ"
575 int MatNorm_MPIBAIJ(Mat mat,NormType type,double *norm)
576 {
577   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
578   Mat_SeqBAIJ *amat = (Mat_SeqBAIJ*) baij->A->data, *bmat = (Mat_SeqBAIJ*) baij->B->data;
579   int        ierr, i,bs2=baij->bs2;
580   double     sum = 0.0;
581   Scalar     *v;
582 
583   PetscFunctionBegin;
584   if (baij->size == 1) {
585     ierr =  MatNorm(baij->A,type,norm); CHKERRQ(ierr);
586   } else {
587     if (type == NORM_FROBENIUS) {
588       v = amat->a;
589       for (i=0; i<amat->nz*bs2; i++ ) {
590 #if defined(USE_PETSC_COMPLEX)
591         sum += real(conj(*v)*(*v)); v++;
592 #else
593         sum += (*v)*(*v); v++;
594 #endif
595       }
596       v = bmat->a;
597       for (i=0; i<bmat->nz*bs2; i++ ) {
598 #if defined(USE_PETSC_COMPLEX)
599         sum += real(conj(*v)*(*v)); v++;
600 #else
601         sum += (*v)*(*v); v++;
602 #endif
603       }
604       ierr = MPI_Allreduce(&sum,norm,1,MPI_DOUBLE,MPI_SUM,mat->comm);CHKERRQ(ierr);
605       *norm = sqrt(*norm);
606     } else {
607       SETERRQ(PETSC_ERR_SUP,0,"No support for this norm yet");
608     }
609   }
610   PetscFunctionReturn(0);
611 }
612 
#undef __FUNC__
#define __FUNC__ "MatAssemblyBegin_MPIBAIJ"
/*
   MatAssemblyBegin_MPIBAIJ - Starts the assembly communication: verifies
   that all processes use the same InsertMode, partitions this process's
   stashed off-process values by owner, posts nonblocking receives, and
   fires nonblocking sends of (row,col,value) triples packed as Scalars.
   The pending request handles and buffers are saved in baij for
   MatAssemblyEnd_MPIBAIJ to complete.
*/
int MatAssemblyBegin_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Comm    comm = mat->comm;
  int         size = baij->size, *owners = baij->rowners,bs=baij->bs;
  int         rank = baij->rank,tag = mat->tag, *owner,*starts,count,ierr;
  MPI_Request *send_waits,*recv_waits;
  int         *nprocs,i,j,idx,*procs,nsends,nreceives,nmax,*work;
  InsertMode  addv;
  Scalar      *rvalues,*svalues;

  PetscFunctionBegin;
  /* make sure all processors are either in INSERTMODE or ADDMODE */
  ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,comm);CHKERRQ(ierr);
  if (addv == (ADD_VALUES|INSERT_VALUES)) {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Some processors inserted others added");
  }
  mat->insertmode = addv; /* in case this processor had no cache */

  /*  first count number of contributors to each processor */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc( (baij->stash.n+1)*sizeof(int) ); CHKPTRQ(owner);
  /* nprocs[j] = #values destined for process j; procs[j] = 0/1 flag */
  for ( i=0; i<baij->stash.n; i++ ) {
    idx = baij->stash.idx[i];
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; break;
      }
    }
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work      = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr      = MPI_Allreduce(procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nreceives = work[rank];
  ierr      = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax      = work[rank];
  PetscFree(work);

  /* post receives:
       1) each message will consist of ordered pairs
     (global index,value) we store the global index as a double
     to simplify the message passing.
       2) since we don't know how long each individual message is we
     allocate the largest needed buffer for each receive. Potentially
     this is a lot of wasted space.


       This could be done better.
  */
  rvalues = (Scalar *) PetscMalloc(3*(nreceives+1)*(nmax+1)*sizeof(Scalar));CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nreceives+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nreceives; i++ ) {
    ierr = MPI_Irecv(rvalues+3*nmax*i,3*nmax,MPIU_SCALAR,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  svalues = (Scalar *) PetscMalloc(3*(baij->stash.n+1)*sizeof(Scalar));CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc((nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* pack (row,col,value) triples; starts[] is advanced while packing */
  for ( i=0; i<baij->stash.n; i++ ) {
    svalues[3*starts[owner[i]]]       = (Scalar)  baij->stash.idx[i];
    svalues[3*starts[owner[i]]+1]     = (Scalar)  baij->stash.idy[i];
    svalues[3*(starts[owner[i]]++)+2] =  baij->stash.array[i];
  }
  PetscFree(owner);
  /* rebuild starts[] (it was consumed above) before issuing the sends */
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+3*starts[i],3*nprocs[i],MPIU_SCALAR,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts); PetscFree(nprocs);

  /* Free cache space */
  PLogInfo(0,"[%d]MatAssemblyBegin_MPIBAIJ:Number of off-processor values %d\n",rank,baij->stash.n);
  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);

  /* save pending communication state for MatAssemblyEnd_MPIBAIJ */
  baij->svalues    = svalues;    baij->rvalues    = rvalues;
  baij->nsends     = nsends;     baij->nrecvs     = nreceives;
  baij->send_waits = send_waits; baij->recv_waits = recv_waits;
  baij->rmax       = nmax;

  PetscFunctionReturn(0);
}
709 
710 /*
711   Creates the hash table, and sets the table
712   This table is created only once.
  If new entries need to be added to the matrix
714   then the hash table has to be destroyed and
715   recreated.
716 */
717 #undef __FUNC__
718 #define __FUNC__ "MatCreateHashTable_MPIBAIJ_Private"
719 int MatCreateHashTable_MPIBAIJ_Private(Mat mat,double factor)
720 {
721   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
722   Mat         A = baij->A, B=baij->B;
723   Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data, *b=(Mat_SeqBAIJ *)B->data;
724   int         i,j,k,nz=a->nz+b->nz,h1,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
725   int         size,ct=0,max1=0,max2=0,bs2=baij->bs2,rstart=baij->rstart;
726   int         cstart=baij->cstart,*garray=baij->garray,row,col;
727   int         *HT,key;
728   Scalar      **HD;
729 
730   PetscFunctionBegin;
731   baij->ht_size=(int)(factor*nz);
732   size = baij->ht_size;
733 
734   if (baij->ht) {
735     PetscFunctionReturn(0);
736   }
737 
738   /* Allocate Memory for Hash Table */
739   baij->hd = (Scalar**)PetscMalloc((size)*(sizeof(int)+sizeof(Scalar*))+1); CHKPTRQ(baij->hd);
740   baij->ht = (int*)(baij->hd + size);
741   HD = baij->hd;
742   HT = baij->ht;
743 
744 
745   PetscMemzero(HT,size*sizeof(int)+sizeof(Scalar*));
746 
747 
748   /* Loop Over A */
749   for ( i=0; i<a->mbs; i++ ) {
750     for ( j=ai[i]; j<ai[i+1]; j++ ) {
751       row = i+rstart;
752       col = aj[j]+cstart;
753 
754       key = row*baij->Nbs + col + 1;
755       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
756       h1  = HASH1(size,key);
757 
758       for ( k=0; k<size; k++ ){
759         if (HT[(h1+k)%size] == 0.0) {
760           HT[(h1+k)%size] = key;
761           HD[(h1+k)%size] = a->a + j*bs2;
762           break;
763         } else ct++;
764       }
765       if (k> max1) max1 = k;
766     }
767   }
768   /* Loop Over B */
769   for ( i=0; i<b->mbs; i++ ) {
770     for ( j=bi[i]; j<bi[i+1]; j++ ) {
771       row = i+rstart;
772       col = garray[bj[j]];
773       key = row*baij->Nbs + col + 1;
774       /* printf("[%d]row,col,key = %2d,%2d,%5.2f\n",PetscGlobalRank,row,col,key); */
775       h1  = HASH1(size,key);
776       for ( k=0; k<size; k++ ){
777         if (HT[(h1+k)%size] == 0.0) {
778           HT[(h1+k)%size] = key;
779           HD[(h1+k)%size] = b->a + j*bs2;
780           break;
781         } else ct++;
782       }
783       if (k> max2) max2 = k;
784     }
785   }
786 
787   /* Print Summary */
788   for ( i=0,key=0.0,j=0; i<size; i++)
789     if (HT[i]) {j++;}
790 
791   /*printf("[%d] fact = %3.2f max1 = %5d max2 = %5d Size %5d - Searches %5d Avg %5.2f Keys %5d\n",
792          PetscGlobalRank,factor,max1,max2,size,ct+j,((float)ct+j),j); */
793   fflush(stdout);
794   PetscFunctionReturn(0);
795 }
796 
#undef __FUNC__
#define __FUNC__ "MatAssemblyEnd_MPIBAIJ"
/*
   MatAssemblyEnd_MPIBAIJ - Completes assembly of a parallel BAIJ matrix.

   Drains the off-processor stash messages posted by MatAssemblyBegin:
   each received message is a sequence of (row,col,value) Scalar triples
   (hence the factors of 3 below), which are inserted into either the
   diagonal block baij->A or the off-diagonal block baij->B.  Afterwards
   the two sequential blocks are assembled, disassembly state is agreed
   upon collectively, and (optionally) the hash-table fast-insertion path
   is enabled via -use_hash.
*/
int MatAssemblyEnd_MPIBAIJ(Mat mat,MatAssemblyType mode)
{
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  MPI_Status  *send_status,recv_status;
  int         imdex,nrecvs = baij->nrecvs, count = nrecvs, i, n, ierr;
  int         bs=baij->bs,row,col,other_disassembled,flg;
  Scalar      *values,val;
  InsertMode  addv = mat->insertmode;

  PetscFunctionBegin;
  /*  wait on receives */
  while (count) {
    ierr = MPI_Waitany(nrecvs,baij->recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space; each message holds up to
       baij->rmax triples of (row,col,value) packed as Scalars */
    values = baij->rvalues + 3*imdex*baij->rmax;
    ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,&n);CHKERRQ(ierr);
    n = n/3;                      /* number of (row,col,value) triples */
    for ( i=0; i<n; i++ ) {
      /* row/col arrive as global indices encoded in Scalars; convert row
         to a local index relative to this process's first row */
      row = (int) PetscReal(values[3*i]) - baij->rstart*bs;
      col = (int) PetscReal(values[3*i+1]);
      val = values[3*i+2];
      if (col >= baij->cstart*bs && col < baij->cend*bs) {
        /* column lies in the diagonal block: shift to local numbering */
        col -= baij->cstart*bs;
        ierr = MatSetValues(baij->A,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      } else {
        /* column belongs to the off-diagonal block B */
        if (mat->was_assembled) {
          /* colmap maps global block column -> local B column (+1, 0 = absent) */
          if (!baij->colmap) {
            ierr = CreateColmap_MPIBAIJ_Private(mat); CHKERRQ(ierr);
          }
          col = (baij->colmap[col/bs]) - 1 + col%bs;
          /* col < 0 means this column was not present before: if new nonzeros
             are allowed we must disassemble and fall back to global indexing */
          if (col < 0  && !((Mat_SeqBAIJ*)(baij->A->data))->nonew) {
            ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
            col = (int) PetscReal(values[3*i+1]);
          }
        }
        ierr = MatSetValues(baij->B,1,&row,1,&col,&val,addv); CHKERRQ(ierr)
      }
    }
    count--;
  }
  PetscFree(baij->recv_waits); PetscFree(baij->rvalues);

  /* wait on sends */
  if (baij->nsends) {
    send_status = (MPI_Status *) PetscMalloc(baij->nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(baij->nsends,baij->send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(baij->send_waits); PetscFree(baij->svalues);

  ierr = MatAssemblyBegin(baij->A,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->A,mode); CHKERRQ(ierr);

  /* determine if any processor has disassembled, if so we must
     also disassemble ourselfs, in order that we may reassemble. */
  ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
  if (mat->was_assembled && !other_disassembled) {
    ierr = DisAssemble_MPIBAIJ(mat); CHKERRQ(ierr);
  }

  /* first final assembly: build the scatter for matrix-vector products */
  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = MatSetUpMultiply_MPIBAIJ(mat); CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(baij->B,mode); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(baij->B,mode); CHKERRQ(ierr);

  /* optionally switch MatSetValues to the hash-table fast path; the
     option value (if > 1.0) is the table oversizing factor */
  ierr = OptionsHasName(PETSC_NULL,"-use_hash",&flg); CHKERRQ(ierr);
  if (flg && !baij->ht && mode== MAT_FINAL_ASSEMBLY) {
    double fact = 1.39;
    ierr = OptionsGetDouble(PETSC_NULL,"-use_hash",&fact,&flg); CHKERRQ(ierr);
    if (fact <= 1.0) fact = 1.39;
    PLogInfo(0,"[%d]MatAssemblyEnd_MPIBAIJ:Hash table Factor used %5.2f\n",PetscGlobalRank,fact);
    ierr = MatCreateHashTable_MPIBAIJ_Private(mat,fact); CHKERRQ(ierr);
    mat->ops.setvalues        = MatSetValues_MPIBAIJ_HT;
    mat->ops.setvaluesblocked = MatSetValuesBlocked_MPIBAIJ_HT;
  }
  /* scratch space allocated by MatGetRow is no longer needed */
  if (baij->rowvalues) {PetscFree(baij->rowvalues); baij->rowvalues = 0;}
  PetscFunctionReturn(0);
}
878 
879 #undef __FUNC__
880 #define __FUNC__ "MatView_MPIBAIJ_Binary"
881 static int MatView_MPIBAIJ_Binary(Mat mat,Viewer viewer)
882 {
883   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
884   int          ierr;
885 
886   PetscFunctionBegin;
887   if (baij->size == 1) {
888     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
889   } else SETERRQ(PETSC_ERR_SUP,0,"Only uniprocessor output supported");
890   PetscFunctionReturn(0);
891 }
892 
893 #undef __FUNC__
894 #define __FUNC__ "MatView_MPIBAIJ_ASCIIorDraworMatlab"
895 static int MatView_MPIBAIJ_ASCIIorDraworMatlab(Mat mat,Viewer viewer)
896 {
897   Mat_MPIBAIJ  *baij = (Mat_MPIBAIJ *) mat->data;
898   int          ierr, format,rank,bs = baij->bs;
899   FILE         *fd;
900   ViewerType   vtype;
901 
902   PetscFunctionBegin;
903   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
904   if (vtype  == ASCII_FILES_VIEWER || vtype == ASCII_FILE_VIEWER) {
905     ierr = ViewerGetFormat(viewer,&format);
906     if (format == VIEWER_FORMAT_ASCII_INFO_LONG) {
907       MatInfo info;
908       MPI_Comm_rank(mat->comm,&rank);
909       ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
910       ierr = MatGetInfo(mat,MAT_LOCAL,&info);
911       PetscSequentialPhaseBegin(mat->comm,1);
912       fprintf(fd,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
913               rank,baij->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
914               baij->bs,(int)info.memory);
915       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);
916       fprintf(fd,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
917       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);
918       fprintf(fd,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);
919       fflush(fd);
920       PetscSequentialPhaseEnd(mat->comm,1);
921       ierr = VecScatterView(baij->Mvctx,viewer); CHKERRQ(ierr);
922       PetscFunctionReturn(0);
923     } else if (format == VIEWER_FORMAT_ASCII_INFO) {
924       PetscPrintf(mat->comm,"  block size is %d\n",bs);
925       PetscFunctionReturn(0);
926     }
927   }
928 
929   if (vtype == DRAW_VIEWER) {
930     Draw       draw;
931     PetscTruth isnull;
932     ierr = ViewerDrawGetDraw(viewer,&draw); CHKERRQ(ierr);
933     ierr = DrawIsNull(draw,&isnull); CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
934   }
935 
936   if (vtype == ASCII_FILE_VIEWER) {
937     ierr = ViewerASCIIGetPointer(viewer,&fd); CHKERRQ(ierr);
938     PetscSequentialPhaseBegin(mat->comm,1);
939     fprintf(fd,"[%d] rows %d starts %d ends %d cols %d starts %d ends %d\n",
940            baij->rank,baij->m,baij->rstart*bs,baij->rend*bs,baij->n,
941             baij->cstart*bs,baij->cend*bs);
942     ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
943     ierr = MatView(baij->B,viewer); CHKERRQ(ierr);
944     fflush(fd);
945     PetscSequentialPhaseEnd(mat->comm,1);
946   } else {
947     int size = baij->size;
948     rank = baij->rank;
949     if (size == 1) {
950       ierr = MatView(baij->A,viewer); CHKERRQ(ierr);
951     } else {
952       /* assemble the entire matrix onto first processor. */
953       Mat         A;
954       Mat_SeqBAIJ *Aloc;
955       int         M = baij->M, N = baij->N,*ai,*aj,row,col,i,j,k,*rvals;
956       int         mbs=baij->mbs;
957       Scalar      *a;
958 
959       if (!rank) {
960         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
961       } else {
962         ierr = MatCreateMPIBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
963       }
964       PLogObjectParent(mat,A);
965 
966       /* copy over the A part */
967       Aloc = (Mat_SeqBAIJ*) baij->A->data;
968       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
969       row = baij->rstart;
970       rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
971 
972       for ( i=0; i<mbs; i++ ) {
973         rvals[0] = bs*(baij->rstart + i);
974         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
975         for ( j=ai[i]; j<ai[i+1]; j++ ) {
976           col = (baij->cstart+aj[j])*bs;
977           for (k=0; k<bs; k++ ) {
978             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
979             col++; a += bs;
980           }
981         }
982       }
983       /* copy over the B part */
984       Aloc = (Mat_SeqBAIJ*) baij->B->data;
985       ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
986       row = baij->rstart*bs;
987       for ( i=0; i<mbs; i++ ) {
988         rvals[0] = bs*(baij->rstart + i);
989         for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
990         for ( j=ai[i]; j<ai[i+1]; j++ ) {
991           col = baij->garray[aj[j]]*bs;
992           for (k=0; k<bs; k++ ) {
993             ierr = MatSetValues(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
994             col++; a += bs;
995           }
996         }
997       }
998       PetscFree(rvals);
999       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1000       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1001       /*
1002          Everyone has to call to draw the matrix since the graphics waits are
1003          synchronized across all processors that share the Draw object
1004       */
1005       if (!rank || vtype == DRAW_VIEWER) {
1006         ierr = MatView(((Mat_MPIBAIJ*)(A->data))->A,viewer); CHKERRQ(ierr);
1007       }
1008       ierr = MatDestroy(A); CHKERRQ(ierr);
1009     }
1010   }
1011   PetscFunctionReturn(0);
1012 }
1013 
1014 
1015 
1016 #undef __FUNC__
1017 #define __FUNC__ "MatView_MPIBAIJ"
1018 int MatView_MPIBAIJ(PetscObject obj,Viewer viewer)
1019 {
1020   Mat         mat = (Mat) obj;
1021   int         ierr;
1022   ViewerType  vtype;
1023 
1024   PetscFunctionBegin;
1025   ierr = ViewerGetType(viewer,&vtype); CHKERRQ(ierr);
1026   if (vtype == ASCII_FILE_VIEWER || vtype == ASCII_FILES_VIEWER ||
1027       vtype == DRAW_VIEWER       || vtype == MATLAB_VIEWER) {
1028     ierr = MatView_MPIBAIJ_ASCIIorDraworMatlab(mat,viewer); CHKERRQ(ierr);
1029   } else if (vtype == BINARY_FILE_VIEWER) {
1030     ierr = MatView_MPIBAIJ_Binary(mat,viewer);CHKERRQ(ierr);
1031   }
1032   PetscFunctionReturn(0);
1033 }
1034 
#undef __FUNC__
#define __FUNC__ "MatDestroy_MPIBAIJ"
/*
   MatDestroy_MPIBAIJ - Releases all storage owned by an MPIBAIJ matrix:
   the stash, the two sequential blocks A and B, the communication objects
   (lvec, Mvctx), and the optional scratch arrays.  Optional members are
   guarded since they are only allocated on demand.
*/
int MatDestroy_MPIBAIJ(PetscObject obj)
{
  Mat         mat = (Mat) obj;
  Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
  int         ierr;

  PetscFunctionBegin;
#if defined(USE_PETSC_LOG)
  PLogObjectState(obj,"Rows=%d, Cols=%d",baij->M,baij->N);
#endif

  ierr = StashDestroy_Private(&baij->stash); CHKERRQ(ierr);
  PetscFree(baij->rowners);
  ierr = MatDestroy(baij->A); CHKERRQ(ierr);
  ierr = MatDestroy(baij->B); CHKERRQ(ierr);
  /* the following are allocated lazily and may never have been created */
  if (baij->colmap) PetscFree(baij->colmap);
  if (baij->garray) PetscFree(baij->garray);
  if (baij->lvec)   VecDestroy(baij->lvec);
  if (baij->Mvctx)  VecScatterDestroy(baij->Mvctx);
  if (baij->rowvalues) PetscFree(baij->rowvalues);
  if (baij->barray) PetscFree(baij->barray);
  /* baij->ht points into the hd allocation, so only hd is freed */
  if (baij->hd) PetscFree(baij->hd);
  PetscFree(baij);
  PLogObjectDestroy(mat);
  PetscHeaderDestroy(mat);
  PetscFunctionReturn(0);
}
1064 
1065 #undef __FUNC__
1066 #define __FUNC__ "MatMult_MPIBAIJ"
1067 int MatMult_MPIBAIJ(Mat A,Vec xx,Vec yy)
1068 {
1069   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1070   int         ierr, nt;
1071 
1072   PetscFunctionBegin;
1073   VecGetLocalSize_Fast(xx,nt);
1074   if (nt != a->n) {
1075     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible partition of A and xx");
1076   }
1077   VecGetLocalSize_Fast(yy,nt);
1078   if (nt != a->m) {
1079     SETERRQ(PETSC_ERR_ARG_SIZ,0,"Incompatible parition of A and yy");
1080   }
1081   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1082   ierr = (*a->A->ops.mult)(a->A,xx,yy); CHKERRQ(ierr);
1083   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1084   ierr = (*a->B->ops.multadd)(a->B,a->lvec,yy,yy); CHKERRQ(ierr);
1085   ierr = VecScatterPostRecvs(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 #undef __FUNC__
1090 #define __FUNC__ "MatMultAdd_MPIBAIJ"
1091 int MatMultAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1092 {
1093   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1094   int        ierr;
1095 
1096   PetscFunctionBegin;
1097   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1098   ierr = (*a->A->ops.multadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
1099   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
1100   ierr = (*a->B->ops.multadd)(a->B,a->lvec,zz,zz); CHKERRQ(ierr);
1101   PetscFunctionReturn(0);
1102 }
1103 
#undef __FUNC__
#define __FUNC__ "MatMultTrans_MPIBAIJ"
/*
   MatMultTrans_MPIBAIJ - Computes yy = A^T * xx.

   The transpose of the off-diagonal block contributes to OTHER processes'
   portions of yy, so B^T*xx is computed into lvec and scattered in reverse
   (with addition) while the local A^T*xx is computed.
*/
int MatMultTrans_MPIBAIJ(Mat A,Vec xx,Vec yy)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int         ierr;

  PetscFunctionBegin;
  /* do nondiagonal part */
  ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  /* do local part */
  ierr = (*a->A->ops.multtrans)(a->A,xx,yy); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in yy until the next line, which is true for my implementation*/
  /* but is not perhaps always true. */
  ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1124 
#undef __FUNC__
#define __FUNC__ "MatMultTransAdd_MPIBAIJ"
/*
   MatMultTransAdd_MPIBAIJ - Computes zz = yy + A^T * xx.

   Same communication pattern as MatMultTrans_MPIBAIJ: B^T*xx goes into
   lvec and is reverse-scattered (with addition) into zz while the local
   A^T*xx + yy is accumulated into zz.
*/
int MatMultTransAdd_MPIBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
  int         ierr;

  PetscFunctionBegin;
  /* do nondiagonal part */
  ierr = (*a->B->ops.multtrans)(a->B,xx,a->lvec); CHKERRQ(ierr);
  /* send it on its way */
  ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  /* do local part */
  ierr = (*a->A->ops.multtransadd)(a->A,xx,yy,zz); CHKERRQ(ierr);
  /* receive remote parts: note this assumes the values are not actually */
  /* inserted in zz until the next line, which is true for my implementation */
  /* but is not perhaps always true. */
  ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1145 
1146 /*
1147   This only works correctly for square matrices where the subblock A->A is the
1148    diagonal block
1149 */
1150 #undef __FUNC__
1151 #define __FUNC__ "MatGetDiagonal_MPIBAIJ"
1152 int MatGetDiagonal_MPIBAIJ(Mat A,Vec v)
1153 {
1154   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1155   int         ierr;
1156 
1157   PetscFunctionBegin;
1158   if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,0,"Supports only square matrix where A->A is diag block");
1159   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1160   PetscFunctionReturn(0);
1161 }
1162 
1163 #undef __FUNC__
1164 #define __FUNC__ "MatScale_MPIBAIJ"
1165 int MatScale_MPIBAIJ(Scalar *aa,Mat A)
1166 {
1167   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1168   int         ierr;
1169 
1170   PetscFunctionBegin;
1171   ierr = MatScale(aa,a->A); CHKERRQ(ierr);
1172   ierr = MatScale(aa,a->B); CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNC__
1177 #define __FUNC__ "MatGetSize_MPIBAIJ"
1178 int MatGetSize_MPIBAIJ(Mat matin,int *m,int *n)
1179 {
1180   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1181 
1182   PetscFunctionBegin;
1183   if (m) *m = mat->M;
1184   if (n) *n = mat->N;
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 #undef __FUNC__
1189 #define __FUNC__ "MatGetLocalSize_MPIBAIJ"
1190 int MatGetLocalSize_MPIBAIJ(Mat matin,int *m,int *n)
1191 {
1192   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1193 
1194   PetscFunctionBegin;
1195   *m = mat->m; *n = mat->N;
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 #undef __FUNC__
1200 #define __FUNC__ "MatGetOwnershipRange_MPIBAIJ"
1201 int MatGetOwnershipRange_MPIBAIJ(Mat matin,int *m,int *n)
1202 {
1203   Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
1204 
1205   PetscFunctionBegin;
1206   *m = mat->rstart*mat->bs; *n = mat->rend*mat->bs;
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 extern int MatGetRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1211 extern int MatRestoreRow_SeqBAIJ(Mat,int,int*,int**,Scalar**);
1212 
#undef __FUNC__
#define __FUNC__ "MatGetRow_MPIBAIJ"
/*
   MatGetRow_MPIBAIJ - Returns one (local) row of the matrix in global,
   column-sorted order.

   The row is assembled from the A (diagonal) and B (off-diagonal) blocks:
   B's columns with global index below cstart come first, then all of A's
   columns, then B's remaining columns.  The merged result is returned in
   scratch arrays (rowvalues/rowindices) that persist until the next call;
   they are sized once for the longest local row.  Must be paired with
   MatRestoreRow.
*/
int MatGetRow_MPIBAIJ(Mat matin,int row,int *nz,int **idx,Scalar **v)
{
  Mat_MPIBAIJ *mat = (Mat_MPIBAIJ *) matin->data;
  Scalar     *vworkA, *vworkB, **pvA, **pvB,*v_p;
  int        bs = mat->bs, bs2 = mat->bs2, i, ierr, *cworkA, *cworkB, **pcA, **pcB;
  int        nztot, nzA, nzB, lrow, brstart = mat->rstart*bs, brend = mat->rend*bs;
  int        *cmap, *idx_p,cstart = mat->cstart;

  PetscFunctionBegin;
  if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqBAIJ *Aa = (Mat_SeqBAIJ *) mat->A->data,*Ba = (Mat_SeqBAIJ *) mat->B->data;
    int     max = 1,mbs = mat->mbs,tmp;
    for ( i=0; i<mbs; i++ ) {
      /* longest block row, counting both A and B contributions */
      tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i];
      if (max < tmp) { max = tmp; }
    }
    /* one allocation holds both the values and the indices */
    mat->rowvalues = (Scalar *) PetscMalloc( max*bs2*(sizeof(int)+sizeof(Scalar)));
    CHKPTRQ(mat->rowvalues);
    mat->rowindices = (int *) (mat->rowvalues + max*bs2);
  }

  /* only rows owned by this process may be requested */
  if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,0,"Only local rows")
  lrow = row - brstart;

  /* only request from A/B the pieces (values and/or indices) the caller wants */
  pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
  if (!v)   {pvA = 0; pvB = 0;}
  if (!idx) {pcA = 0; if (!v) pcB = 0;}
  ierr = (*mat->A->ops.getrow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.getrow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  nztot = nzA + nzB;

  cmap  = mat->garray;  /* local B block column -> global block column */
  if (v  || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      int imark = -1;   /* first B entry whose global column is >= cstart */
      if (v) {
        *v = v_p = mat->rowvalues;
        /* B entries left of the diagonal block come first */
        for ( i=0; i<nzB; i++ ) {
          if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for ( i=0; i<nzA; i++ )     v_p[imark+i] = vworkA[i];
        for ( i=imark; i<nzB; i++ ) v_p[nzA+i]   = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          /* imark already computed during the value pass */
          for ( i=0; i<imark; i++ ) {
            idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
          }
        } else {
          /* values were not requested; find imark here */
          for ( i=0; i<nzB; i++ ) {
            if (cmap[cworkB[i]/bs] < cstart)
              idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
            else break;
          }
          imark = i;
        }
        for ( i=0; i<nzA; i++ )     idx_p[imark+i] = cstart*bs + cworkA[i];
        for ( i=imark; i<nzB; i++ ) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
      }
    } else {
      if (idx) *idx = 0;
      if (v)   *v   = 0;
    }
  }
  *nz = nztot;
  ierr = (*mat->A->ops.restorerow)(mat->A,lrow,&nzA,pcA,pvA); CHKERRQ(ierr);
  ierr = (*mat->B->ops.restorerow)(mat->B,lrow,&nzB,pcB,pvB); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1294 
1295 #undef __FUNC__
1296 #define __FUNC__ "MatRestoreRow_MPIBAIJ"
1297 int MatRestoreRow_MPIBAIJ(Mat mat,int row,int *nz,int **idx,Scalar **v)
1298 {
1299   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1300 
1301   PetscFunctionBegin;
1302   if (baij->getrowactive == PETSC_FALSE) {
1303     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"MatGetRow not called");
1304   }
1305   baij->getrowactive = PETSC_FALSE;
1306   PetscFunctionReturn(0);
1307 }
1308 
1309 #undef __FUNC__
1310 #define __FUNC__ "MatGetBlockSize_MPIBAIJ"
1311 int MatGetBlockSize_MPIBAIJ(Mat mat,int *bs)
1312 {
1313   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) mat->data;
1314 
1315   PetscFunctionBegin;
1316   *bs = baij->bs;
1317   PetscFunctionReturn(0);
1318 }
1319 
1320 #undef __FUNC__
1321 #define __FUNC__ "MatZeroEntries_MPIBAIJ"
1322 int MatZeroEntries_MPIBAIJ(Mat A)
1323 {
1324   Mat_MPIBAIJ *l = (Mat_MPIBAIJ *) A->data;
1325   int         ierr;
1326 
1327   PetscFunctionBegin;
1328   ierr = MatZeroEntries(l->A); CHKERRQ(ierr);
1329   ierr = MatZeroEntries(l->B); CHKERRQ(ierr);
1330   PetscFunctionReturn(0);
1331 }
1332 
1333 #undef __FUNC__
1334 #define __FUNC__ "MatGetInfo_MPIBAIJ"
1335 int MatGetInfo_MPIBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1336 {
1337   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) matin->data;
1338   Mat         A = a->A, B = a->B;
1339   int         ierr;
1340   double      isend[5], irecv[5];
1341 
1342   PetscFunctionBegin;
1343   info->block_size     = (double)a->bs;
1344   ierr = MatGetInfo(A,MAT_LOCAL,info); CHKERRQ(ierr);
1345   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->memory;
1346   ierr = MatGetInfo(B,MAT_LOCAL,info); CHKERRQ(ierr);
1347   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->memory;
1348   if (flag == MAT_LOCAL) {
1349     info->nz_used      = isend[0];
1350     info->nz_allocated = isend[1];
1351     info->nz_unneeded  = isend[2];
1352     info->memory       = isend[3];
1353     info->mallocs      = isend[4];
1354   } else if (flag == MAT_GLOBAL_MAX) {
1355     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_MAX,matin->comm);CHKERRQ(ierr);
1356     info->nz_used      = irecv[0];
1357     info->nz_allocated = irecv[1];
1358     info->nz_unneeded  = irecv[2];
1359     info->memory       = irecv[3];
1360     info->mallocs      = irecv[4];
1361   } else if (flag == MAT_GLOBAL_SUM) {
1362     ierr = MPI_Allreduce(isend,irecv,5,MPI_INT,MPI_SUM,matin->comm);CHKERRQ(ierr);
1363     info->nz_used      = irecv[0];
1364     info->nz_allocated = irecv[1];
1365     info->nz_unneeded  = irecv[2];
1366     info->memory       = irecv[3];
1367     info->mallocs      = irecv[4];
1368   }
1369   info->rows_global       = (double)a->M;
1370   info->columns_global    = (double)a->N;
1371   info->rows_local        = (double)a->m;
1372   info->columns_local     = (double)a->N;
1373   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1374   info->fill_ratio_needed = 0;
1375   info->factor_mallocs    = 0;
1376   PetscFunctionReturn(0);
1377 }
1378 
1379 #undef __FUNC__
1380 #define __FUNC__ "MatSetOption_MPIBAIJ"
1381 int MatSetOption_MPIBAIJ(Mat A,MatOption op)
1382 {
1383   Mat_MPIBAIJ *a = (Mat_MPIBAIJ *) A->data;
1384 
1385   PetscFunctionBegin;
1386   if (op == MAT_NO_NEW_NONZERO_LOCATIONS ||
1387       op == MAT_YES_NEW_NONZERO_LOCATIONS ||
1388       op == MAT_COLUMNS_UNSORTED ||
1389       op == MAT_COLUMNS_SORTED ||
1390       op == MAT_NEW_NONZERO_ALLOCATION_ERROR ||
1391       op == MAT_NEW_NONZERO_LOCATION_ERROR) {
1392         MatSetOption(a->A,op);
1393         MatSetOption(a->B,op);
1394   } else if (op == MAT_ROW_ORIENTED) {
1395         a->roworiented = 1;
1396         MatSetOption(a->A,op);
1397         MatSetOption(a->B,op);
1398   } else if (op == MAT_ROWS_SORTED ||
1399              op == MAT_ROWS_UNSORTED ||
1400              op == MAT_SYMMETRIC ||
1401              op == MAT_STRUCTURALLY_SYMMETRIC ||
1402              op == MAT_YES_NEW_DIAGONALS)
1403     PLogInfo(A,"Info:MatSetOption_MPIBAIJ:Option ignored\n");
1404   else if (op == MAT_COLUMN_ORIENTED) {
1405     a->roworiented = 0;
1406     MatSetOption(a->A,op);
1407     MatSetOption(a->B,op);
1408   } else if (op == MAT_IGNORE_OFF_PROC_ENTRIES) {
1409     a->donotstash = 1;
1410   } else if (op == MAT_NO_NEW_DIAGONALS) {
1411     SETERRQ(PETSC_ERR_SUP,0,"MAT_NO_NEW_DIAGONALS");
1412   } else {
1413     SETERRQ(PETSC_ERR_SUP,0,"unknown option");
1414   }
1415   PetscFunctionReturn(0);
1416 }
1417 
1418 #undef __FUNC__
1419 #define __FUNC__ "MatTranspose_MPIBAIJ("
1420 int MatTranspose_MPIBAIJ(Mat A,Mat *matout)
1421 {
1422   Mat_MPIBAIJ *baij = (Mat_MPIBAIJ *) A->data;
1423   Mat_SeqBAIJ *Aloc;
1424   Mat        B;
1425   int        ierr,M=baij->M,N=baij->N,*ai,*aj,row,i,*rvals,j,k,col;
1426   int        bs=baij->bs,mbs=baij->mbs;
1427   Scalar     *a;
1428 
1429   PetscFunctionBegin;
1430   if (matout == PETSC_NULL && M != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"Square matrix only for in-place");
1431   ierr = MatCreateMPIBAIJ(A->comm,baij->bs,PETSC_DECIDE,PETSC_DECIDE,N,M,0,PETSC_NULL,0,PETSC_NULL,&B);
1432   CHKERRQ(ierr);
1433 
1434   /* copy over the A part */
1435   Aloc = (Mat_SeqBAIJ*) baij->A->data;
1436   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1437   row = baij->rstart;
1438   rvals = (int *) PetscMalloc(bs*sizeof(int)); CHKPTRQ(rvals);
1439 
1440   for ( i=0; i<mbs; i++ ) {
1441     rvals[0] = bs*(baij->rstart + i);
1442     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1443     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1444       col = (baij->cstart+aj[j])*bs;
1445       for (k=0; k<bs; k++ ) {
1446         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1447         col++; a += bs;
1448       }
1449     }
1450   }
1451   /* copy over the B part */
1452   Aloc = (Mat_SeqBAIJ*) baij->B->data;
1453   ai = Aloc->i; aj = Aloc->j; a = Aloc->a;
1454   row = baij->rstart*bs;
1455   for ( i=0; i<mbs; i++ ) {
1456     rvals[0] = bs*(baij->rstart + i);
1457     for ( j=1; j<bs; j++ ) { rvals[j] = rvals[j-1] + 1; }
1458     for ( j=ai[i]; j<ai[i+1]; j++ ) {
1459       col = baij->garray[aj[j]]*bs;
1460       for (k=0; k<bs; k++ ) {
1461         ierr = MatSetValues(B,1,&col,bs,rvals,a,INSERT_VALUES);CHKERRQ(ierr);
1462         col++; a += bs;
1463       }
1464     }
1465   }
1466   PetscFree(rvals);
1467   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1468   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
1469 
1470   if (matout != PETSC_NULL) {
1471     *matout = B;
1472   } else {
1473     /* This isn't really an in-place transpose .... but free data structures from baij */
1474     PetscFree(baij->rowners);
1475     ierr = MatDestroy(baij->A); CHKERRQ(ierr);
1476     ierr = MatDestroy(baij->B); CHKERRQ(ierr);
1477     if (baij->colmap) PetscFree(baij->colmap);
1478     if (baij->garray) PetscFree(baij->garray);
1479     if (baij->lvec) VecDestroy(baij->lvec);
1480     if (baij->Mvctx) VecScatterDestroy(baij->Mvctx);
1481     PetscFree(baij);
1482     PetscMemcpy(A,B,sizeof(struct _p_Mat));
1483     PetscHeaderDestroy(B);
1484   }
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 #undef __FUNC__
1489 #define __FUNC__ "MatDiagonalScale_MPIBAIJ"
1490 int MatDiagonalScale_MPIBAIJ(Mat A,Vec ll,Vec rr)
1491 {
1492   Mat a = ((Mat_MPIBAIJ *) A->data)->A;
1493   Mat b = ((Mat_MPIBAIJ *) A->data)->B;
1494   int ierr,s1,s2,s3;
1495 
1496   PetscFunctionBegin;
1497   if (ll)  {
1498     ierr = VecGetLocalSize(ll,&s1); CHKERRQ(ierr);
1499     ierr = MatGetLocalSize(A,&s2,&s3); CHKERRQ(ierr);
1500     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,0,"non-conforming local sizes");
1501     ierr = MatDiagonalScale(a,ll,0); CHKERRQ(ierr);
1502     ierr = MatDiagonalScale(b,ll,0); CHKERRQ(ierr);
1503   }
1504   if (rr) SETERRQ(PETSC_ERR_SUP,0,"not supported for right vector");
1505   PetscFunctionReturn(0);
1506 }
1507 
#undef __FUNC__
#define __FUNC__ "MatZeroRows_MPIBAIJ"
/*
   MatZeroRows_MPIBAIJ - Zeros the (global) rows listed in `is`; if diag is
   non-null the diagonal entry of each zeroed row is set to *diag.

   Any process may list any global row, so the routine first routes each
   requested row index to its owning process with a hand-rolled sparse
   all-to-all (Irecv/Isend pairs), then each process zeroes the rows it
   owns in both its A and B blocks.
*/
int MatZeroRows_MPIBAIJ(Mat A,IS is,Scalar *diag)
{
  Mat_MPIBAIJ    *l = (Mat_MPIBAIJ *) A->data;
  int            i,ierr,N, *rows,*owners = l->rowners,size = l->size;
  int            *procs,*nprocs,j,found,idx,nsends,*work,row;
  int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
  int            *rvalues,tag = A->tag,count,base,slen,n,*source;
  int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
  MPI_Comm       comm = A->comm;
  MPI_Request    *send_waits,*recv_waits;
  MPI_Status     recv_status,*send_status;
  IS             istmp;

  PetscFunctionBegin;
  ierr = ISGetSize(is,&N); CHKERRQ(ierr);
  ierr = ISGetIndices(is,&rows); CHKERRQ(ierr);

  /*  first count number of contributors to each processor */
  /* nprocs[j] = number of rows destined for process j; procs[j] = 0/1 flag */
  nprocs = (int *) PetscMalloc( 2*size*sizeof(int) ); CHKPTRQ(nprocs);
  PetscMemzero(nprocs,2*size*sizeof(int)); procs = nprocs + size;
  owner = (int *) PetscMalloc((N+1)*sizeof(int)); CHKPTRQ(owner); /* see note*/
  for ( i=0; i<N; i++ ) {
    idx = rows[i];
    found = 0;
    /* rowners holds block-row ownership ranges; scale by bs for point rows */
    for ( j=0; j<size; j++ ) {
      if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
        nprocs[j]++; procs[j] = 1; owner[i] = j; found = 1; break;
      }
    }
    if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Index out of range");
  }
  nsends = 0;  for ( i=0; i<size; i++ ) { nsends += procs[i];}

  /* inform other processors of number of messages and max length*/
  work   = (int *) PetscMalloc( size*sizeof(int) ); CHKPTRQ(work);
  ierr   = MPI_Allreduce( procs, work,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs = work[rank];      /* how many processes will send to me */
  ierr   = MPI_Allreduce( nprocs, work,size,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
  nmax   = work[rank];      /* longest message I will receive */
  PetscFree(work);

  /* post receives:   */
  rvalues = (int *) PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int)); CHKPTRQ(rvalues);
  recv_waits = (MPI_Request *) PetscMalloc((nrecvs+1)*sizeof(MPI_Request));CHKPTRQ(recv_waits);
  for ( i=0; i<nrecvs; i++ ) {
    ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
  }

  /* do sends:
     1) starts[i] gives the starting index in svalues for stuff going to
     the ith processor
  */
  svalues = (int *) PetscMalloc( (N+1)*sizeof(int) ); CHKPTRQ(svalues);
  send_waits = (MPI_Request *) PetscMalloc( (nsends+1)*sizeof(MPI_Request));CHKPTRQ(send_waits);
  starts = (int *) PetscMalloc( (size+1)*sizeof(int) ); CHKPTRQ(starts);
  starts[0] = 0;
  for ( i=1; i<size; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  /* bucket the row indices by destination process */
  for ( i=0; i<N; i++ ) {
    svalues[starts[owner[i]]++] = rows[i];
  }
  ISRestoreIndices(is,&rows);

  /* starts[] was consumed above; rebuild it before issuing the sends */
  starts[0] = 0;
  for ( i=1; i<size+1; i++ ) { starts[i] = starts[i-1] + nprocs[i-1];}
  count = 0;
  for ( i=0; i<size; i++ ) {
    if (procs[i]) {
      ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
    }
  }
  PetscFree(starts);

  /* first point row owned locally, used to localize received indices */
  base = owners[rank]*bs;

  /*  wait on receives */
  lens   = (int *) PetscMalloc( 2*(nrecvs+1)*sizeof(int) ); CHKPTRQ(lens);
  source = lens + nrecvs;
  count  = nrecvs; slen = 0;
  while (count) {
    ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
    /* unpack receives into our local space */
    ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
    source[imdex]  = recv_status.MPI_SOURCE;
    lens[imdex]  = n;
    slen += n;        /* total number of rows this process must zero */
    count--;
  }
  PetscFree(recv_waits);

  /* move the data into the send scatter */
  lrows = (int *) PetscMalloc( (slen+1)*sizeof(int) ); CHKPTRQ(lrows);
  count = 0;
  for ( i=0; i<nrecvs; i++ ) {
    values = rvalues + i*nmax;
    for ( j=0; j<lens[i]; j++ ) {
      lrows[count++] = values[j] - base;   /* convert to local point row */
    }
  }
  PetscFree(rvalues); PetscFree(lens);
  PetscFree(owner); PetscFree(nprocs);

  /* actually zap the local rows */
  ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
  PLogObjectParent(A,istmp);

  /* both blocks are zeroed without setting a diagonal (0 arg) */
  ierr = MatZeroRows(l->A,istmp,0); CHKERRQ(ierr);
  ierr = MatZeroRows(l->B,istmp,0); CHKERRQ(ierr);
  ierr = ISDestroy(istmp); CHKERRQ(ierr);

  /* the diagonal entries are reinserted globally and reassembled */
  if (diag) {
    for ( i = 0; i < slen; i++ ) {
      row = lrows[i] + rstart_bs;
      ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES); CHKERRQ(ierr);
    }
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  }
  PetscFree(lrows);

  /* wait on sends */
  if (nsends) {
    send_status = (MPI_Status *) PetscMalloc(nsends*sizeof(MPI_Status));CHKPTRQ(send_status);
    ierr        = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
    PetscFree(send_status);
  }
  PetscFree(send_waits); PetscFree(svalues);

  PetscFunctionReturn(0);
}
1639 extern int MatPrintHelp_SeqBAIJ(Mat);
1640 #undef __FUNC__
1641 #define __FUNC__ "MatPrintHelp_MPIBAIJ"
1642 int MatPrintHelp_MPIBAIJ(Mat A)
1643 {
1644   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1645   int         ierr;
1646 
1647   PetscFunctionBegin;
1648   if (!a->rank) {
1649     ierr = MatPrintHelp_SeqBAIJ(a->A);CHKERRQ(ierr);
1650   }
1651   PetscFunctionReturn(0);
1652 }
1653 
1654 #undef __FUNC__
1655 #define __FUNC__ "MatSetUnfactored_MPIBAIJ"
1656 int MatSetUnfactored_MPIBAIJ(Mat A)
1657 {
1658   Mat_MPIBAIJ *a   = (Mat_MPIBAIJ*) A->data;
1659   int         ierr;
1660 
1661   PetscFunctionBegin;
1662   ierr = MatSetUnfactored(a->A); CHKERRQ(ierr);
1663   PetscFunctionReturn(0);
1664 }
1665 
1666 static int MatConvertSameType_MPIBAIJ(Mat,Mat *,int);
1667 
1668 /* -------------------------------------------------------------------*/
/*
   Operation table for the MPIBAIJ format.  NOTE: this initializer is
   positional -- each entry fills the next member of struct _MatOps and a
   0 marks an operation this format does not implement.  Do NOT reorder
   or insert entries without consulting the struct _MatOps declaration.
*/
static struct _MatOps MatOps = {
  MatSetValues_MPIBAIJ,MatGetRow_MPIBAIJ,MatRestoreRow_MPIBAIJ,MatMult_MPIBAIJ,
  MatMultAdd_MPIBAIJ,MatMultTrans_MPIBAIJ,MatMultTransAdd_MPIBAIJ,0,
  0,0,0,0,
  0,0,MatTranspose_MPIBAIJ,MatGetInfo_MPIBAIJ,
  0,MatGetDiagonal_MPIBAIJ,MatDiagonalScale_MPIBAIJ,MatNorm_MPIBAIJ,
  MatAssemblyBegin_MPIBAIJ,MatAssemblyEnd_MPIBAIJ,0,MatSetOption_MPIBAIJ,
  MatZeroEntries_MPIBAIJ,MatZeroRows_MPIBAIJ,0,
  0,0,0,MatGetSize_MPIBAIJ,
  MatGetLocalSize_MPIBAIJ,MatGetOwnershipRange_MPIBAIJ,0,0,
  0,0,MatConvertSameType_MPIBAIJ,0,0,
  0,0,0,MatGetSubMatrices_MPIBAIJ,
  MatIncreaseOverlap_MPIBAIJ,MatGetValues_MPIBAIJ,0,MatPrintHelp_MPIBAIJ,
  MatScale_MPIBAIJ,0,0,0,MatGetBlockSize_MPIBAIJ,
  0,0,0,0,0,0,MatSetUnfactored_MPIBAIJ,0,MatSetValuesBlocked_MPIBAIJ};
1684 
1685 
#undef __FUNC__
#define __FUNC__ "MatCreateMPIBAIJ"
/*@C
   MatCreateMPIBAIJ - Creates a sparse parallel matrix in block AIJ format
   (block compressed row).  For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameters
   d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
   performance can be increased by more than a factor of 50.

   Input Parameters:
.  comm - MPI communicator
.  bs   - size of block
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
           This value should be the same as the local size used in creating the
           x vector for the matrix-vector product y = Ax.
.  M - number of global rows (or PETSC_DECIDE to have calculated if m is given)
.  N - number of global columns (or PETSC_DECIDE to have calculated if n is given)
.  d_nz  - number of block nonzeros per block row in diagonal portion of local
           submatrix  (same for all local rows)
.  d_nnz - array containing the number of block nonzeros in the various block rows
           of the diagonal portion of the local submatrix (possibly different for
           each block row) or PETSC_NULL.  You must leave room for the diagonal
           entry even if it is zero.
.  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
           submatrix (same for all local rows).
.  o_nnz - array containing the number of nonzeros in the various block rows of the
           off-diagonal portion of the local submatrix (possibly different for
           each block row) or PETSC_NULL.

   Output Parameter:
.  A - the matrix

   Notes:
   The user MUST specify either the local or global matrix dimensions
   (possibly both).

   Storage Information:
   For a square global matrix we define each processor's diagonal portion
   to be its local rows and the corresponding columns (a square submatrix);
   each processor's off-diagonal portion encompasses the remainder of the
   local matrix (a rectangular submatrix).

   The user can specify preallocated storage for the diagonal part of
   the local submatrix with either d_nz or d_nnz (not both).  Set
   d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
   memory allocation.  Likewise, specify preallocated storage for the
   off-diagonal part of the local submatrix with o_nz or o_nnz (not both).

   Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
   the figure below we depict these three local rows and all columns (0-11).

$          0 1 2 3 4 5 6 7 8 9 10 11
$         -------------------
$  row 3  |  o o o d d d o o o o o o
$  row 4  |  o o o d d d o o o o o o
$  row 5  |  o o o d d d o o o o o o
$         -------------------
$

   Thus, any entries in the d locations are stored in the d (diagonal)
   submatrix, and any entries in the o locations are stored in the
   o (off-diagonal) submatrix.  Note that the d and the o submatrices are
   stored simply in the MATSEQBAIJ format for compressed row storage.

   Now d_nz should indicate the number of block nonzeros per row in the d matrix,
   and o_nz should indicate the number of block nonzeros per row in the o matrix.
   In general, for PDE problems in which most nonzeros are near the diagonal,
   one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
   or you will get TERRIBLE performance; see the users' manual chapter on
   matrices.

.keywords: matrix, block, aij, compressed row, sparse, parallel

.seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues()
@*/
int MatCreateMPIBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,
                    int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
{
  Mat          B;
  Mat_MPIBAIJ  *b;
  int          ierr, i,sum[2],work[2],mbs,nbs,Mbs=PETSC_DECIDE,Nbs=PETSC_DECIDE,size;

  PetscFunctionBegin;
  if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Invalid block size specified, must be positive");

  MPI_Comm_size(comm,&size);
  /* uniprocessor case: build a sequential BAIJ matrix instead (the
     off-diagonal preallocation parameters o_nz/o_nnz are irrelevant) */
  if (size == 1) {
    if (M == PETSC_DECIDE) M = m;
    if (N == PETSC_DECIDE) N = n;
    ierr = MatCreateSeqBAIJ(comm,bs,M,N,d_nz,d_nnz,A); CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  *A = 0;
  PetscHeaderCreate(B,_p_Mat,MAT_COOKIE,MATMPIBAIJ,comm,MatDestroy,MatView);
  PLogObjectCreate(B);
  B->data       = (void *) (b = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(b);
  PetscMemzero(b,sizeof(Mat_MPIBAIJ));
  PetscMemcpy(&B->ops,&MatOps,sizeof(struct _MatOps));

  B->destroy    = MatDestroy_MPIBAIJ;
  B->view       = MatView_MPIBAIJ;
  B->mapping    = 0;
  B->factor     = 0;
  B->assembled  = PETSC_FALSE;

  B->insertmode = NOT_SET_VALUES;
  MPI_Comm_rank(comm,&b->rank);
  MPI_Comm_size(comm,&b->size);

  /* sanity checks on the size arguments */
  if ( m == PETSC_DECIDE && (d_nnz != PETSC_NULL || o_nnz != PETSC_NULL)) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"Cannot have PETSC_DECIDE rows but set d_nnz or o_nnz");
  }
  if ( M == PETSC_DECIDE && m == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either M or m should be specified");
  }
  if ( N == PETSC_DECIDE && n == PETSC_DECIDE) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,0,"either N or n should be specified");
  }
  /* if both local and global sizes are given, the local sizes win; reset
     the global sizes so they are recomputed from the locals below */
  if ( M != PETSC_DECIDE && m != PETSC_DECIDE) M = PETSC_DECIDE;
  if ( N != PETSC_DECIDE && n != PETSC_DECIDE) N = PETSC_DECIDE;

  /* global sizes as the sum of the local sizes over all processes */
  if (M == PETSC_DECIDE || N == PETSC_DECIDE) {
    work[0] = m; work[1] = n;
    mbs = m/bs; nbs = n/bs;
    ierr = MPI_Allreduce( work, sum,2,MPI_INT,MPI_SUM,comm );CHKERRQ(ierr);
    if (M == PETSC_DECIDE) {M = sum[0]; Mbs = M/bs;}
    if (N == PETSC_DECIDE) {N = sum[1]; Nbs = N/bs;}
  }
  /* local sizes by near-even division of the global block count; the
     first (Mbs % size) processes get one extra block row */
  if (m == PETSC_DECIDE) {
    Mbs = M/bs;
    if (Mbs*bs != M) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global rows must be divisible by blocksize");
    mbs = Mbs/b->size + ((Mbs % b->size) > b->rank);
    m   = mbs*bs;
  }
  if (n == PETSC_DECIDE) {
    Nbs = N/bs;
    if (Nbs*bs != N) SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of global cols must be divisible by blocksize");
    nbs = Nbs/b->size + ((Nbs % b->size) > b->rank);
    n   = nbs*bs;
  }
  if (mbs*bs != m || nbs*bs != n) {
    SETERRQ(PETSC_ERR_ARG_SIZ,0,"No of local rows, cols must be divisible by blocksize");
  }

  b->m = m; B->m = m;
  b->n = n; B->n = n;
  b->N = N; B->N = N;
  b->M = M; B->M = M;
  b->bs  = bs;
  b->bs2 = bs*bs;
  b->mbs = mbs;
  b->nbs = nbs;
  b->Mbs = Mbs;
  b->Nbs = Nbs;

  /* build local table of row and column ownerships; rowners[]/cowners[]
     are prefix sums of the per-process block counts (block units) */
  b->rowners = (int *) PetscMalloc(2*(b->size+2)*sizeof(int)); CHKPTRQ(b->rowners);
  PLogObjectMemory(B,2*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
  b->cowners = b->rowners + b->size + 2;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->rowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->rowners[i] += b->rowners[i-1];
  }
  b->rstart    = b->rowners[b->rank];
  b->rend      = b->rowners[b->rank+1];
  /* *_bs variants are the same ranges in point (scalar index) units */
  b->rstart_bs = b->rstart * bs;
  b->rend_bs   = b->rend * bs;

  ierr = MPI_Allgather(&nbs,1,MPI_INT,b->cowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  b->cowners[0] = 0;
  for ( i=2; i<=b->size; i++ ) {
    b->cowners[i] += b->cowners[i-1];
  }
  b->cstart    = b->cowners[b->rank];
  b->cend      = b->cowners[b->rank+1];
  b->cstart_bs = b->cstart * bs;
  b->cend_bs   = b->cend * bs;


  /* the diagonal block is m x n; the off-diagonal block is stored with the
     full global column width N and compacted later by MatSetUpMultiply */
  if (d_nz == PETSC_DEFAULT) d_nz = 5;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,n,d_nz,d_nnz,&b->A); CHKERRQ(ierr);
  PLogObjectParent(B,b->A);
  if (o_nz == PETSC_DEFAULT) o_nz = 0;
  ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,m,N,o_nz,o_nnz,&b->B); CHKERRQ(ierr);
  PLogObjectParent(B,b->B);

  /* build cache for off array entries formed */
  ierr = StashBuild_Private(&b->stash); CHKERRQ(ierr);
  b->donotstash  = 0;
  b->colmap      = 0;
  b->garray      = 0;
  b->roworiented = 1;

  /* stuff used in block assembly */
  b->barray       = 0;

  /* stuff used for matrix vector multiply */
  b->lvec         = 0;
  b->Mvctx        = 0;

  /* stuff for MatGetRow() */
  b->rowindices   = 0;
  b->rowvalues    = 0;
  b->getrowactive = PETSC_FALSE;

  /* hash table stuff */
  b->ht          = 0;
  b->ht_size     = 0;

  *A = B;
  PetscFunctionReturn(0);
}
1903 
1904 #undef __FUNC__
1905 #define __FUNC__ "MatConvertSameType_MPIBAIJ"
1906 static int MatConvertSameType_MPIBAIJ(Mat matin,Mat *newmat,int cpvalues)
1907 {
1908   Mat         mat;
1909   Mat_MPIBAIJ *a,*oldmat = (Mat_MPIBAIJ *) matin->data;
1910   int         ierr, len=0, flg;
1911 
1912   PetscFunctionBegin;
1913   *newmat       = 0;
1914   PetscHeaderCreate(mat,_p_Mat,MAT_COOKIE,MATMPIBAIJ,matin->comm,MatDestroy,MatView);
1915   PLogObjectCreate(mat);
1916   mat->data       = (void *) (a = PetscNew(Mat_MPIBAIJ)); CHKPTRQ(a);
1917   PetscMemcpy(&mat->ops,&MatOps,sizeof(struct _MatOps));
1918   mat->destroy    = MatDestroy_MPIBAIJ;
1919   mat->view       = MatView_MPIBAIJ;
1920   mat->factor     = matin->factor;
1921   mat->assembled  = PETSC_TRUE;
1922 
1923   a->m = mat->m   = oldmat->m;
1924   a->n = mat->n   = oldmat->n;
1925   a->M = mat->M   = oldmat->M;
1926   a->N = mat->N   = oldmat->N;
1927 
1928   a->bs  = oldmat->bs;
1929   a->bs2 = oldmat->bs2;
1930   a->mbs = oldmat->mbs;
1931   a->nbs = oldmat->nbs;
1932   a->Mbs = oldmat->Mbs;
1933   a->Nbs = oldmat->Nbs;
1934 
1935   a->rstart       = oldmat->rstart;
1936   a->rend         = oldmat->rend;
1937   a->cstart       = oldmat->cstart;
1938   a->cend         = oldmat->cend;
1939   a->size         = oldmat->size;
1940   a->rank         = oldmat->rank;
1941   mat->insertmode = NOT_SET_VALUES;
1942   a->rowvalues    = 0;
1943   a->getrowactive = PETSC_FALSE;
1944   a->barray       = 0;
1945 
1946   a->rowners = (int *) PetscMalloc(2*(a->size+2)*sizeof(int)); CHKPTRQ(a->rowners);
1947   PLogObjectMemory(mat,2*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPIBAIJ));
1948   a->cowners = a->rowners + a->size + 2;
1949   PetscMemcpy(a->rowners,oldmat->rowners,2*(a->size+2)*sizeof(int));
1950   ierr = StashInitialize_Private(&a->stash); CHKERRQ(ierr);
1951   if (oldmat->colmap) {
1952     a->colmap = (int *) PetscMalloc((a->Nbs)*sizeof(int));CHKPTRQ(a->colmap);
1953     PLogObjectMemory(mat,(a->Nbs)*sizeof(int));
1954     PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));
1955   } else a->colmap = 0;
1956   if (oldmat->garray && (len = ((Mat_SeqBAIJ *) (oldmat->B->data))->nbs)) {
1957     a->garray = (int *) PetscMalloc(len*sizeof(int)); CHKPTRQ(a->garray);
1958     PLogObjectMemory(mat,len*sizeof(int));
1959     PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));
1960   } else a->garray = 0;
1961 
1962   ierr =  VecDuplicate(oldmat->lvec,&a->lvec); CHKERRQ(ierr);
1963   PLogObjectParent(mat,a->lvec);
1964   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx); CHKERRQ(ierr);
1965   PLogObjectParent(mat,a->Mvctx);
1966   ierr =  MatConvert(oldmat->A,MATSAME,&a->A); CHKERRQ(ierr);
1967   PLogObjectParent(mat,a->A);
1968   ierr =  MatConvert(oldmat->B,MATSAME,&a->B); CHKERRQ(ierr);
1969   PLogObjectParent(mat,a->B);
1970   ierr = OptionsHasName(PETSC_NULL,"-help",&flg); CHKERRQ(ierr);
1971   if (flg) {
1972     ierr = MatPrintHelp(mat); CHKERRQ(ierr);
1973   }
1974   *newmat = mat;
1975   PetscFunctionReturn(0);
1976 }
1977 
1978 #include "sys.h"
1979 
#undef __FUNC__
#define __FUNC__ "MatLoad_MPIBAIJ"
/*
     MatLoad_MPIBAIJ - Loads a square matrix from a PETSc binary viewer
   into a parallel MPIBAIJ matrix.  Process 0 reads the file (header, row
   lengths, column indices, numerical values) and ships each process its
   portion with MPI sends; the block size is taken from the runtime option
   -matload_block_size (default 1).  If the global size is not divisible
   by the block size, extra padded rows/columns with unit diagonal entries
   are appended.  The 'type' argument is unused here.
*/
int MatLoad_MPIBAIJ(Viewer viewer,MatType type,Mat *newmat)
{
  Mat          A;
  int          i, nz, ierr, j,rstart, rend, fd;
  Scalar       *vals,*buf;
  MPI_Comm     comm = ((PetscObject)viewer)->comm;
  MPI_Status   status;
  int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
  int          *locrowlens,*sndcounts = 0,*procsnz = 0, jj,*mycols,*ibuf;
  int          flg,tag = ((PetscObject)viewer)->tag,bs=1,bs2,Mbs,mbs,extra_rows;
  int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
  int          dcount,kmax,k,nzcount,tmp;

  PetscFunctionBegin;
  ierr = OptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,&flg);CHKERRQ(ierr);
  bs2  = bs*bs;

  /* only process 0 opens and reads the file; fd is valid only there */
  MPI_Comm_size(comm,&size); MPI_Comm_rank(comm,&rank);
  if (!rank) {
    ierr = ViewerBinaryGetDescriptor(viewer,&fd); CHKERRQ(ierr);
    ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT); CHKERRQ(ierr);
    if (header[0] != MAT_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"not matrix object");
    if (header[3] < 0) {
      SETERRQ(PETSC_ERR_FILE_UNEXPECTED,1,"Matrix stored in special format, cannot load as MPIBAIJ");
    }
  }

  ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
  M = header[1]; N = header[2];

  if (M != N) SETERRQ(PETSC_ERR_SUP,0,"Can only do square matrices");

  /*
     This code adds extra rows to make sure the number of rows is
     divisible by the blocksize
  */
  Mbs        = M/bs;
  extra_rows = bs - M + bs*(Mbs);
  if (extra_rows == bs) extra_rows = 0;  /* M was already divisible by bs */
  else                  Mbs++;
  if (extra_rows &&!rank) {
    PLogInfo(0,"MatLoad_MPIBAIJ:Padding loaded matrix to match blocksize\n");
  }

  /* determine ownership of all rows; rowners[] is a prefix sum of block
     counts, browners[] the same ranges in point (scalar index) units */
  mbs = Mbs/size + ((Mbs % size) > rank);
  m   = mbs * bs;
  rowners = (int *) PetscMalloc(2*(size+2)*sizeof(int)); CHKPTRQ(rowners);
  browners = rowners + size + 1;
  ierr = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
  rowners[0] = 0;
  for ( i=2; i<=size; i++ ) rowners[i] += rowners[i-1];
  for ( i=0; i<=size;  i++ ) browners[i] = rowners[i]*bs;
  rstart = rowners[rank];    /* ownership range in block units */
  rend   = rowners[rank+1];

  /* distribute row lengths to all processors (padded rows get length 1) */
  locrowlens = (int*) PetscMalloc( (rend-rstart)*bs*sizeof(int) ); CHKPTRQ(locrowlens);
  if (!rank) {
    rowlengths = (int*) PetscMalloc( (M+extra_rows)*sizeof(int) ); CHKPTRQ(rowlengths);
    ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT); CHKERRQ(ierr);
    for ( i=0; i<extra_rows; i++ ) rowlengths[M+i] = 1;
    sndcounts = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(sndcounts);
    for ( i=0; i<size; i++ ) sndcounts[i] = browners[i+1] - browners[i];
    ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
    PetscFree(sndcounts);
  } else {
    ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT, 0,comm);CHKERRQ(ierr);
  }

  if (!rank) {
    /* calculate the number of nonzeros on each processor */
    procsnz = (int*) PetscMalloc( size*sizeof(int) ); CHKPTRQ(procsnz);
    PetscMemzero(procsnz,size*sizeof(int));
    for ( i=0; i<size; i++ ) {
      for ( j=rowners[i]*bs; j< rowners[i+1]*bs; j++ ) {
        procsnz[i] += rowlengths[j];
      }
    }
    PetscFree(rowlengths);

    /* determine max buffer needed and allocate it */
    maxnz = 0;
    for ( i=0; i<size; i++ ) {
      maxnz = PetscMax(maxnz,procsnz[i]);
    }
    cols = (int *) PetscMalloc( maxnz*sizeof(int) ); CHKPTRQ(cols);

    /* read in my part of the matrix column indices  */
    nz = procsnz[0];
    ibuf = (int *) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* with one process the padded entries are not on disk; append the
       column indices of the unit diagonal padding by hand */
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }

    /* read in every ones (except the last) and ship off */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
    }
    /* read in the stuff for the last proc */
    if ( size != 1 ) {
      nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
      ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) cols[nz+i] = M+i;
      ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
    }
    PetscFree(cols);
  } else {
    /* determine buffer space needed for message */
    nz = 0;
    for ( i=0; i<m; i++ ) {
      nz += locrowlens[i];
    }
    ibuf   = (int*) PetscMalloc( nz*sizeof(int) ); CHKPTRQ(ibuf);
    mycols = ibuf;
    /* receive message of column indices*/
    ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
    ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");
  }

  /* loop over local rows, determining number of off diagonal entries;
     mask[] marks block columns already seen in the current block row,
     masked1[]/masked2[] remember which entries to reset afterwards */
  dlens  = (int *) PetscMalloc( 2*(rend-rstart+1)*sizeof(int) ); CHKPTRQ(dlens);
  odlens = dlens + (rend-rstart);
  mask   = (int *) PetscMalloc( 3*Mbs*sizeof(int) ); CHKPTRQ(mask);
  PetscMemzero(mask,3*Mbs*sizeof(int));
  masked1 = mask    + Mbs;
  masked2 = masked1 + Mbs;
  rowcount = 0; nzcount = 0;
  for ( i=0; i<mbs; i++ ) {
    dcount  = 0;
    odcount = 0;
    for ( j=0; j<bs; j++ ) {
      kmax = locrowlens[rowcount];
      for ( k=0; k<kmax; k++ ) {
        tmp = mycols[nzcount++]/bs;   /* block column of this entry */
        if (!mask[tmp]) {
          mask[tmp] = 1;
          if (tmp < rstart || tmp >= rend ) masked2[odcount++] = tmp;
          else masked1[dcount++] = tmp;
        }
      }
      rowcount++;
    }

    dlens[i]  = dcount;
    odlens[i] = odcount;

    /* zero out the mask elements we set */
    for ( j=0; j<dcount; j++ ) mask[masked1[j]] = 0;
    for ( j=0; j<odcount; j++ ) mask[masked2[j]] = 0;
  }

  /* create our matrix, preallocated exactly from the counted lengths */
  ierr = MatCreateMPIBAIJ(comm,bs,m,PETSC_DECIDE,M+extra_rows,N+extra_rows,0,dlens,0,odlens,newmat);
         CHKERRQ(ierr);
  A = *newmat;
  MatSetOption(A,MAT_COLUMNS_SORTED);

  if (!rank) {
    buf = (Scalar *) PetscMalloc( maxnz*sizeof(Scalar) ); CHKPTRQ(buf);
    /* read in my part of the matrix numerical values  */
    nz = procsnz[0];
    vals = buf;
    mycols = ibuf;
    if (size == 1)  nz -= extra_rows;
    ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
    if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }  /* unit diagonal padding */

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
    /* read in other processors (except the last one) and ship out */
    for ( i=1; i<size-1; i++ ) {
      nz   = procsnz[i];
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
    }
    /* the last proc; note i == size-1 here from the loop above, so
       procsnz[i] is the last process's count (the inner loop reuses i) */
    if ( size != 1 ){
      nz   = procsnz[i] - extra_rows;
      vals = buf;
      ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR); CHKERRQ(ierr);
      for ( i=0; i<extra_rows; i++ ) vals[nz+i] = 1.0;
      ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
    }
    PetscFree(procsnz);
  } else {
    /* receive numeric values */
    buf = (Scalar*) PetscMalloc( nz*sizeof(Scalar) ); CHKPTRQ(buf);

    /* receive message of values*/
    vals   = buf;
    mycols = ibuf;
    ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
    ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
    if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,0,"something is wrong with file");

    /* insert into matrix */
    jj      = rstart*bs;
    for ( i=0; i<m; i++ ) {
      ierr    = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
      mycols += locrowlens[i];
      vals   += locrowlens[i];
      jj++;
    }
  }
  PetscFree(locrowlens);
  PetscFree(buf);
  PetscFree(ibuf);
  PetscFree(rowners);
  PetscFree(dlens);
  PetscFree(mask);
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2208 
2209 
2210