xref: /petsc/src/mat/impls/sbaij/mpi/mpisbaij.c (revision 5ed2d596fd9914de76abbaed56c65f4792ae57ae)
1 /*$Id: mpisbaij.c,v 1.61 2001/08/10 03:31:37 bsmith Exp $*/
2 
3 #include "src/mat/impls/baij/mpi/mpibaij.h"    /*I "petscmat.h" I*/
4 #include "src/vec/vecimpl.h"
5 #include "mpisbaij.h"
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 
8 extern int MatSetUpMultiply_MPISBAIJ(Mat);
9 extern int MatSetUpMultiply_MPISBAIJ_2comm(Mat);
10 extern int DisAssemble_MPISBAIJ(Mat);
11 extern int MatIncreaseOverlap_MPISBAIJ(Mat,int,IS *,int);
12 extern int MatGetSubMatrices_MPISBAIJ(Mat,int,IS *,IS *,MatReuse,Mat **);
13 extern int MatGetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *);
14 extern int MatSetValues_SeqSBAIJ(Mat,int,int *,int,int *,PetscScalar *,InsertMode);
15 extern int MatSetValuesBlocked_SeqSBAIJ(Mat,int,int*,int,int*,PetscScalar*,InsertMode);
16 extern int MatGetRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
17 extern int MatRestoreRow_SeqSBAIJ(Mat,int,int*,int**,PetscScalar**);
18 extern int MatPrintHelp_SeqSBAIJ(Mat);
19 extern int MatZeroRows_SeqSBAIJ(Mat,IS,PetscScalar*);
20 extern int MatZeroRows_SeqBAIJ(Mat,IS,PetscScalar *);
21 extern int MatGetRowMax_MPISBAIJ(Mat,Vec);
22 extern int MatRelax_MPISBAIJ(Mat,Vec,PetscReal,MatSORType,PetscReal,int,int,Vec);
23 
24 /*  UGLY, ugly, ugly
25    When MatScalar == PetscScalar the function MatSetValuesBlocked_MPISBAIJ_MatScalar() does
26    not exist. Otherwise the ..._MatScalar() routines take matrix elements already in single
27    precision and insert them directly into the single precision data structures, while
28    MatSetValuesBlocked_MPISBAIJ() converts the PetscScalar entries into single precision and
29    then calls ..._MatScalar() to store them.
30 */
31 #if defined(PETSC_USE_MAT_SINGLE)
32 extern int MatSetValuesBlocked_SeqSBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
33 extern int MatSetValues_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
34 extern int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
35 extern int MatSetValues_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
36 extern int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat,int,int*,int,int*,MatScalar*,InsertMode);
37 #else
38 #define MatSetValuesBlocked_SeqSBAIJ_MatScalar      MatSetValuesBlocked_SeqSBAIJ
39 #define MatSetValues_MPISBAIJ_MatScalar             MatSetValues_MPISBAIJ
40 #define MatSetValuesBlocked_MPISBAIJ_MatScalar      MatSetValuesBlocked_MPISBAIJ
41 #define MatSetValues_MPISBAIJ_HT_MatScalar          MatSetValues_MPISBAIJ_HT
42 #define MatSetValuesBlocked_MPISBAIJ_HT_MatScalar   MatSetValuesBlocked_MPISBAIJ_HT
43 #endif
44 
45 EXTERN_C_BEGIN
46 #undef __FUNCT__
47 #define __FUNCT__ "MatStoreValues_MPISBAIJ"
48 int MatStoreValues_MPISBAIJ(Mat mat)
49 {
50   Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
51   int          ierr;
52 
53   PetscFunctionBegin;
54   ierr = MatStoreValues(aij->A);CHKERRQ(ierr);
55   ierr = MatStoreValues(aij->B);CHKERRQ(ierr);
56   PetscFunctionReturn(0);
57 }
58 EXTERN_C_END
59 
60 EXTERN_C_BEGIN
61 #undef __FUNCT__
62 #define __FUNCT__ "MatRetrieveValues_MPISBAIJ"
63 int MatRetrieveValues_MPISBAIJ(Mat mat)
64 {
65   Mat_MPISBAIJ *aij = (Mat_MPISBAIJ *)mat->data;
66   int          ierr;
67 
68   PetscFunctionBegin;
69   ierr = MatRetrieveValues(aij->A);CHKERRQ(ierr);
70   ierr = MatRetrieveValues(aij->B);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 EXTERN_C_END
74 
75 /*
76      Local utility routine that creates a mapping from the global column
77    number to the local number in the off-diagonal part of the local
78    storage of the matrix.  This is done in a non-scalable way since the
79    length of colmap equals the global matrix length.
80 */
81 #undef __FUNCT__
82 #define __FUNCT__ "CreateColmap_MPISBAIJ_Private"
83 static int CreateColmap_MPISBAIJ_Private(Mat mat)
84 {
85   PetscFunctionBegin;
86   SETERRQ(1,"Function not yet written for SBAIJ format");
87   /* PetscFunctionReturn(0); */
88 }
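/*
   A hedged sketch (kept in a comment, not compiled) of what this routine would do for the
   array variant, mirroring how colmap is consumed elsewhere in this file: colmap[global
   block column] holds 1 + bs*(local block column in B), with 0 meaning "not present".
   The fields Nbs (global number of block columns) and nbs (local number of block columns
   of B) are assumptions here; with PETSC_USE_CTABLE a PetscTable would be used instead.

     Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
     Mat_SeqBAIJ  *B    = (Mat_SeqBAIJ*)baij->B->data;
     int          ierr,i,bs = baij->bs;

     ierr = PetscMalloc((baij->Nbs+1)*sizeof(int),&baij->colmap);CHKERRQ(ierr);
     ierr = PetscMemzero(baij->colmap,baij->Nbs*sizeof(int));CHKERRQ(ierr);
     for (i=0; i<B->nbs; i++) baij->colmap[baij->garray[i]] = i*bs + 1;
*/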
89 
90 #define CHUNKSIZE  10
91 
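/*
   The two macros below insert a single scalar into the diagonal (A, SeqSBAIJ) or
   off-diagonal (B, SeqBAIJ) part: locate the block row, narrow the block-column search
   with a short bisection, finish with a linear scan, and if the block is missing (and new
   nonzeros are allowed) shift the tail of the row up one slot; when the row has no spare
   slot the whole matrix is reallocated with room for CHUNKSIZE extra blocks.
*/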
92 #define  MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv) \
93 { \
94  \
95     brow = row/bs;  \
96     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
97     rmax = aimax[brow]; nrow = ailen[brow]; \
98       bcol = col/bs; \
99       ridx = row % bs; cidx = col % bs; \
100       low = 0; high = nrow; \
101       while (high-low > 3) { \
102         t = (low+high)/2; \
103         if (rp[t] > bcol) high = t; \
104         else              low  = t; \
105       } \
106       for (_i=low; _i<high; _i++) { \
107         if (rp[_i] > bcol) break; \
108         if (rp[_i] == bcol) { \
109           bap  = ap +  bs2*_i + bs*cidx + ridx; \
110           if (addv == ADD_VALUES) *bap += value;  \
111           else                    *bap  = value;  \
112           goto a_noinsert; \
113         } \
114       } \
115       if (a->nonew == 1) goto a_noinsert; \
116       else if (a->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
117       if (nrow >= rmax) { \
118         /* there is no extra room in row, therefore enlarge */ \
119         int       new_nz = ai[a->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
120         MatScalar *new_a; \
121  \
122         if (a->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
123  \
124         /* malloc new storage space */ \
125         len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(a->mbs+1)*sizeof(int); \
126         ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
127         new_j = (int*)(new_a + bs2*new_nz); \
128         new_i = new_j + new_nz; \
129  \
130         /* copy over old data into new slots */ \
131         for (ii=0; ii<brow+1; ii++) {new_i[ii] = ai[ii];} \
132         for (ii=brow+1; ii<a->mbs+1; ii++) {new_i[ii] = ai[ii]+CHUNKSIZE;} \
133         ierr = PetscMemcpy(new_j,aj,(ai[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
134         len = (new_nz - CHUNKSIZE - ai[brow] - nrow); \
135         ierr = PetscMemcpy(new_j+ai[brow]+nrow+CHUNKSIZE,aj+ai[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
136         ierr = PetscMemcpy(new_a,aa,(ai[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
137         ierr = PetscMemzero(new_a+bs2*(ai[brow]+nrow),bs2*CHUNKSIZE*sizeof(MatScalar));CHKERRQ(ierr); \
138         ierr = PetscMemcpy(new_a+bs2*(ai[brow]+nrow+CHUNKSIZE), \
139                     aa+bs2*(ai[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
140         /* free up old matrix storage */ \
141         ierr = PetscFree(a->a);CHKERRQ(ierr);  \
142         if (!a->singlemalloc) { \
143           ierr = PetscFree(a->i);CHKERRQ(ierr); \
144           ierr = PetscFree(a->j);CHKERRQ(ierr);\
145         } \
146         aa = a->a = new_a; ai = a->i = new_i; aj = a->j = new_j;  \
147         a->singlemalloc = PETSC_TRUE; \
148  \
149         rp   = aj + ai[brow]; ap = aa + bs2*ai[brow]; \
150         rmax = aimax[brow] = aimax[brow] + CHUNKSIZE; \
151         PetscLogObjectMemory(A,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
152         a->s_maxnz += bs2*CHUNKSIZE; \
153         a->reallocs++; \
154         a->s_nz++; \
155       } \
156       N = nrow++ - 1;  \
157       /* shift up all the later entries in this row */ \
158       for (ii=N; ii>=_i; ii--) { \
159         rp[ii+1] = rp[ii]; \
160         ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
161       } \
162       if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr); }  \
163       rp[_i]                      = bcol;  \
164       ap[bs2*_i + bs*cidx + ridx] = value;  \
165       a_noinsert:; \
166     ailen[brow] = nrow; \
167 }
168 #ifndef MatSetValues_SeqSBAIJ_B_Private
169 #define  MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv) \
170 { \
171     brow = row/bs;  \
172     rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
173     rmax = bimax[brow]; nrow = bilen[brow]; \
174       bcol = col/bs; \
175       ridx = row % bs; cidx = col % bs; \
176       low = 0; high = nrow; \
177       while (high-low > 3) { \
178         t = (low+high)/2; \
179         if (rp[t] > bcol) high = t; \
180         else              low  = t; \
181       } \
182       for (_i=low; _i<high; _i++) { \
183         if (rp[_i] > bcol) break; \
184         if (rp[_i] == bcol) { \
185           bap  = ap +  bs2*_i + bs*cidx + ridx; \
186           if (addv == ADD_VALUES) *bap += value;  \
187           else                    *bap  = value;  \
188           goto b_noinsert; \
189         } \
190       } \
191       if (b->nonew == 1) goto b_noinsert; \
192       else if (b->nonew == -1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero into matrix"); \
193       if (nrow >= rmax) { \
194         /* there is no extra room in row, therefore enlarge */ \
195         int       new_nz = bi[b->mbs] + CHUNKSIZE,len,*new_i,*new_j; \
196         MatScalar *new_a; \
197  \
198         if (b->nonew == -2) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero in the matrix"); \
199  \
200         /* malloc new storage space */ \
201         len   = new_nz*(sizeof(int)+bs2*sizeof(MatScalar))+(b->mbs+1)*sizeof(int); \
202         ierr  = PetscMalloc(len,&new_a);CHKERRQ(ierr); \
203         new_j = (int*)(new_a + bs2*new_nz); \
204         new_i = new_j + new_nz; \
205  \
206         /* copy over old data into new slots */ \
207         for (ii=0; ii<brow+1; ii++) {new_i[ii] = bi[ii];} \
208         for (ii=brow+1; ii<b->mbs+1; ii++) {new_i[ii] = bi[ii]+CHUNKSIZE;} \
209         ierr = PetscMemcpy(new_j,bj,(bi[brow]+nrow)*sizeof(int));CHKERRQ(ierr); \
210         len  = (new_nz - CHUNKSIZE - bi[brow] - nrow); \
211         ierr = PetscMemcpy(new_j+bi[brow]+nrow+CHUNKSIZE,bj+bi[brow]+nrow,len*sizeof(int));CHKERRQ(ierr); \
212         ierr = PetscMemcpy(new_a,ba,(bi[brow]+nrow)*bs2*sizeof(MatScalar));CHKERRQ(ierr); \
213         ierr = PetscMemzero(new_a+bs2*(bi[brow]+nrow),bs2*CHUNKSIZE*sizeof(MatScalar));CHKERRQ(ierr); \
214         ierr = PetscMemcpy(new_a+bs2*(bi[brow]+nrow+CHUNKSIZE), \
215                     ba+bs2*(bi[brow]+nrow),bs2*len*sizeof(MatScalar));CHKERRQ(ierr);  \
216         /* free up old matrix storage */ \
217         ierr = PetscFree(b->a);CHKERRQ(ierr);  \
218         if (!b->singlemalloc) { \
219           ierr = PetscFree(b->i);CHKERRQ(ierr); \
220           ierr = PetscFree(b->j);CHKERRQ(ierr); \
221         } \
222         ba = b->a = new_a; bi = b->i = new_i; bj = b->j = new_j;  \
223         b->singlemalloc = PETSC_TRUE; \
224  \
225         rp   = bj + bi[brow]; ap = ba + bs2*bi[brow]; \
226         rmax = bimax[brow] = bimax[brow] + CHUNKSIZE; \
227         PetscLogObjectMemory(B,CHUNKSIZE*(sizeof(int) + bs2*sizeof(MatScalar))); \
228         b->maxnz += bs2*CHUNKSIZE; \
229         b->reallocs++; \
230         b->nz++; \
231       } \
232       N = nrow++ - 1;  \
233       /* shift up all the later entries in this row */ \
234       for (ii=N; ii>=_i; ii--) { \
235         rp[ii+1] = rp[ii]; \
236         ierr = PetscMemcpy(ap+bs2*(ii+1),ap+bs2*(ii),bs2*sizeof(MatScalar));CHKERRQ(ierr); \
237       } \
238       if (N>=_i) { ierr = PetscMemzero(ap+bs2*_i,bs2*sizeof(MatScalar));CHKERRQ(ierr);}  \
239       rp[_i]                      = bcol;  \
240       ap[bs2*_i + bs*cidx + ridx] = value;  \
241       b_noinsert:; \
242     bilen[brow] = nrow; \
243 }
244 #endif
245 
246 #if defined(PETSC_USE_MAT_SINGLE)
247 #undef __FUNCT__
248 #define __FUNCT__ "MatSetValues_MPISBAIJ"
249 int MatSetValues_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
250 {
251   Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
252   int          ierr,i,N = m*n;
253   MatScalar    *vsingle;
254 
255   PetscFunctionBegin;
256   if (N > b->setvalueslen) {
257     if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
258     ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
259     b->setvalueslen  = N;
260   }
261   vsingle = b->setvaluescopy;
262 
263   for (i=0; i<N; i++) {
264     vsingle[i] = v[i];
265   }
266   ierr = MatSetValues_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
267   PetscFunctionReturn(0);
268 }
269 
270 #undef __FUNCT__
271 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ"
272 int MatSetValuesBlocked_MPISBAIJ(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
273 {
274   Mat_MPISBAIJ *b = (Mat_MPISBAIJ*)mat->data;
275   int         ierr,i,N = m*n*b->bs2;
276   MatScalar   *vsingle;
277 
278   PetscFunctionBegin;
279   if (N > b->setvalueslen) {
280     if (b->setvaluescopy) {ierr = PetscFree(b->setvaluescopy);CHKERRQ(ierr);}
281     ierr = PetscMalloc(N*sizeof(MatScalar),&b->setvaluescopy);CHKERRQ(ierr);
282     b->setvalueslen  = N;
283   }
284   vsingle = b->setvaluescopy;
285   for (i=0; i<N; i++) {
286     vsingle[i] = v[i];
287   }
288   ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,m,im,n,in,vsingle,addv);CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 #undef __FUNCT__
293 #define __FUNCT__ "MatSetValues_MPISBAIJ_HT"
294 int MatSetValues_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
295 {
296   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
297   int         ierr,i,N = m*n;
298   MatScalar   *vsingle;
299 
300   PetscFunctionBegin;
301   SETERRQ(1,"Function not yet written for SBAIJ format");
302   /* PetscFunctionReturn(0); */
303 }
304 
305 #undef __FUNCT__
306 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT"
307 int MatSetValuesBlocked_MPISBAIJ_HT(Mat mat,int m,int *im,int n,int *in,PetscScalar *v,InsertMode addv)
308 {
309   Mat_MPIBAIJ *b = (Mat_MPIBAIJ*)mat->data;
310   int         ierr,i,N = m*n*b->bs2;
311   MatScalar   *vsingle;
312 
313   PetscFunctionBegin;
314   SETERRQ(1,"Function not yet written for SBAIJ format");
315   /* PetscFunctionReturn(0); */
316 }
317 #endif
318 
319 /* Only add/insert a(i,j) whose block row is <= its block column, i.e. i/bs <= j/bs.
320    Any a(i,j) input by the user whose block row is > its block column is ignored.
321 */
322 #undef __FUNCT__
323 #define __FUNCT__ "MatSetValues_MPISBAIJ"
324 int MatSetValues_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
325 {
326   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
327   MatScalar    value;
328   PetscTruth   roworiented = baij->roworiented;
329   int          ierr,i,j,row,col;
330   int          rstart_orig=baij->rstart_bs;
331   int          rend_orig=baij->rend_bs,cstart_orig=baij->cstart_bs;
332   int          cend_orig=baij->cend_bs,bs=baij->bs;
333 
334   /* Some Variables required in the macro */
335   Mat          A = baij->A;
336   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)(A)->data;
337   int          *aimax=a->imax,*ai=a->i,*ailen=a->ilen,*aj=a->j;
338   MatScalar    *aa=a->a;
339 
340   Mat          B = baij->B;
341   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(B)->data;
342   int          *bimax=b->imax,*bi=b->i,*bilen=b->ilen,*bj=b->j;
343   MatScalar    *ba=b->a;
344 
345   int          *rp,ii,nrow,_i,rmax,N,brow,bcol;
346   int          low,high,t,ridx,cidx,bs2=a->bs2;
347   MatScalar    *ap,*bap;
348 
349   /* for stash */
350   int          n_loc, *in_loc=0;
351   MatScalar    *v_loc=0;
352 
353   PetscFunctionBegin;
354 
355   if(!baij->donotstash){
356     ierr = PetscMalloc(n*sizeof(int),&in_loc);CHKERRQ(ierr);
357     ierr = PetscMalloc(n*sizeof(MatScalar),&v_loc);CHKERRQ(ierr);
358   }
359 
360   for (i=0; i<m; i++) {
361     if (im[i] < 0) continue;
362 #if defined(PETSC_USE_BOPT_g)
363     if (im[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
364 #endif
365     if (im[i] >= rstart_orig && im[i] < rend_orig) { /* this processor entry */
366       row = im[i] - rstart_orig;              /* local row index */
367       for (j=0; j<n; j++) {
368         if (im[i]/bs > in[j]/bs) continue;    /* ignore lower triangular blocks */
369         if (in[j] >= cstart_orig && in[j] < cend_orig){  /* diag entry (A) */
370           col = in[j] - cstart_orig;          /* local col index */
371           brow = row/bs; bcol = col/bs;
372           if (brow > bcol) continue;  /* ignore lower triangular blocks of A */
373           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
374           MatSetValues_SeqSBAIJ_A_Private(row,col,value,addv);
375           /* ierr = MatSetValues_SeqBAIJ(baij->A,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
376         } else if (in[j] < 0) continue;
377 #if defined(PETSC_USE_BOPT_g)
378         else if (in[j] >= mat->N) {SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Col too large");}
379 #endif
380         else {  /* off-diag entry (B) */
381           if (mat->was_assembled) {
382             if (!baij->colmap) {
383               ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
384             }
385 #if defined (PETSC_USE_CTABLE)
386             ierr = PetscTableFind(baij->colmap,in[j]/bs + 1,&col);CHKERRQ(ierr);
387             col  = col - 1;
388 #else
389             col = baij->colmap[in[j]/bs] - 1;
390 #endif
391             if (col < 0 && !((Mat_SeqSBAIJ*)(baij->A->data))->nonew) {
392               ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
393               col =  in[j];
394               /* Reinitialize the variables required by MatSetValues_SeqBAIJ_B_Private() */
395               B = baij->B;
396               b = (Mat_SeqBAIJ*)(B)->data;
397               bimax=b->imax;bi=b->i;bilen=b->ilen;bj=b->j;
398               ba=b->a;
399             } else col += in[j]%bs;
400           } else col = in[j];
401           if (roworiented) value = v[i*n+j]; else value = v[i+j*m];
402           MatSetValues_SeqSBAIJ_B_Private(row,col,value,addv);
403           /* ierr = MatSetValues_SeqBAIJ(baij->B,1,&row,1,&col,&value,addv);CHKERRQ(ierr); */
404         }
405       }
406     } else {  /* off processor entry */
407       if (!baij->donotstash) {
408         n_loc = 0;
409         for (j=0; j<n; j++){
410           if (im[i]/bs > in[j]/bs) continue; /* ignore lower triangular blocks */
411           in_loc[n_loc] = in[j];
412           if (roworiented) {
413             v_loc[n_loc] = v[i*n+j];
414           } else {
415             v_loc[n_loc] = v[j*m+i];
416           }
417           n_loc++;
418         }
419         ierr = MatStashValuesRow_Private(&mat->stash,im[i],n_loc,in_loc,v_loc);CHKERRQ(ierr);
420       }
421     }
422   }
423 
424   if(!baij->donotstash){
425     ierr = PetscFree(in_loc);CHKERRQ(ierr);
426     ierr = PetscFree(v_loc);CHKERRQ(ierr);
427   }
428   PetscFunctionReturn(0);
429 }
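/*
   A hedged usage sketch (mat, ierr and bs = 1 are assumed context, not taken from this
   file) illustrating the convention enforced above: values whose block row exceeds their
   block column are silently dropped, so each off-diagonal entry should be passed once
   with the row index no larger than the column index.

     PetscScalar val = 2.0;
     int         i = 3, j = 5;
     ierr = MatSetValues(mat,1,&i,1,&j,&val,ADD_VALUES);CHKERRQ(ierr);    (kept: row 3 <= column 5)
     ierr = MatSetValues(mat,1,&j,1,&i,&val,ADD_VALUES);CHKERRQ(ierr);    (ignored: row 5 > column 3)
*/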
430 
431 #undef __FUNCT__
432 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ"
433 int MatSetValuesBlocked_MPISBAIJ_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
434 {
435   PetscFunctionBegin;
436   SETERRQ(1,"Function not yet written for SBAIJ format");
437   /* PetscFunctionReturn(0); */
438 }
439 
440 #define HASH_KEY 0.6180339887
441 #define HASH(size,key,tmp) (tmp = (key)*HASH_KEY,(int)((size)*(tmp-(int)tmp)))
442 /* #define HASH(size,key) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
443 /* #define HASH(size,key,tmp) ((int)((size)*fmod(((key)*HASH_KEY),1))) */
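/* Multiplicative hashing (Knuth): HASH_KEY is the fractional part of the golden ratio,
   and HASH(size,key,tmp) returns floor(size * frac(key*HASH_KEY)), a slot in [0,size). */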
444 #undef __FUNCT__
445 #define __FUNCT__ "MatSetValues_MPISBAIJ_HT_MatScalar"
446 int MatSetValues_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
447 {
448   PetscFunctionBegin;
449   SETERRQ(1,"Function not yet written for SBAIJ format");
450   /* PetscFunctionReturn(0); */
451 }
452 
453 #undef __FUNCT__
454 #define __FUNCT__ "MatSetValuesBlocked_MPISBAIJ_HT_MatScalar"
455 int MatSetValuesBlocked_MPISBAIJ_HT_MatScalar(Mat mat,int m,int *im,int n,int *in,MatScalar *v,InsertMode addv)
456 {
457   PetscFunctionBegin;
458   SETERRQ(1,"Function not yet written for SBAIJ format");
459   /* PetscFunctionReturn(0); */
460 }
461 
462 #undef __FUNCT__
463 #define __FUNCT__ "MatGetValues_MPISBAIJ"
464 int MatGetValues_MPISBAIJ(Mat mat,int m,int *idxm,int n,int *idxn,PetscScalar *v)
465 {
466   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
467   int          bs=baij->bs,ierr,i,j,bsrstart = baij->rstart*bs,bsrend = baij->rend*bs;
468   int          bscstart = baij->cstart*bs,bscend = baij->cend*bs,row,col,data;
469 
470   PetscFunctionBegin;
471   for (i=0; i<m; i++) {
472     if (idxm[i] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative row");
473     if (idxm[i] >= mat->M) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Row too large");
474     if (idxm[i] >= bsrstart && idxm[i] < bsrend) {
475       row = idxm[i] - bsrstart;
476       for (j=0; j<n; j++) {
477         if (idxn[j] < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative column");
478         if (idxn[j] >= mat->N) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Column too large");
479         if (idxn[j] >= bscstart && idxn[j] < bscend){
480           col = idxn[j] - bscstart;
481           ierr = MatGetValues_SeqSBAIJ(baij->A,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
482         } else {
483           if (!baij->colmap) {
484             ierr = CreateColmap_MPISBAIJ_Private(mat);CHKERRQ(ierr);
485           }
486 #if defined (PETSC_USE_CTABLE)
487           ierr = PetscTableFind(baij->colmap,idxn[j]/bs+1,&data);CHKERRQ(ierr);
488           data --;
489 #else
490           data = baij->colmap[idxn[j]/bs]-1;
491 #endif
492           if((data < 0) || (baij->garray[data/bs] != idxn[j]/bs)) *(v+i*n+j) = 0.0;
493           else {
494             col  = data + idxn[j]%bs;
495             ierr = MatGetValues_SeqSBAIJ(baij->B,1,&row,1,&col,v+i*n+j);CHKERRQ(ierr);
496           }
497         }
498       }
499     } else {
500       SETERRQ(PETSC_ERR_SUP,"Only local values currently supported");
501     }
502   }
503  PetscFunctionReturn(0);
504 }
505 
506 #undef __FUNCT__
507 #define __FUNCT__ "MatNorm_MPISBAIJ"
508 int MatNorm_MPISBAIJ(Mat mat,NormType type,PetscReal *norm)
509 {
510   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
511   /* Mat_SeqSBAIJ *amat = (Mat_SeqSBAIJ*)baij->A->data; */
512   /* Mat_SeqBAIJ  *bmat = (Mat_SeqBAIJ*)baij->B->data; */
513   int        ierr;
514   PetscReal  sum[2],*lnorm2;
515 
516   PetscFunctionBegin;
517   if (baij->size == 1) {
518     ierr =  MatNorm(baij->A,type,norm);CHKERRQ(ierr);
519   } else {
520     if (type == NORM_FROBENIUS) {
521       ierr = PetscMalloc(2*sizeof(PetscReal),&lnorm2);CHKERRQ(ierr);
522       ierr =  MatNorm(baij->A,type,lnorm2);CHKERRQ(ierr);
523       *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2++;            /* square of norm(A) */
524       ierr =  MatNorm(baij->B,type,lnorm2);CHKERRQ(ierr);
525       *lnorm2 = (*lnorm2)*(*lnorm2); lnorm2--;             /* square of norm(B) */
526       /*
527       ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
528       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], lnorm2=%g, %g\n",rank,lnorm2[0],lnorm2[1]);
529       */
530       ierr = MPI_Allreduce(lnorm2,&sum,2,MPIU_REAL,MPI_SUM,mat->comm);CHKERRQ(ierr);
531       /*
532       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d], sum=%g, %g\n",rank,sum[0],sum[1]);
533       PetscSynchronizedFlush(PETSC_COMM_WORLD); */
534 
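      /* entries of the off-diagonal block B appear twice in the symmetric matrix (once as
         stored and once transposed in the unstored lower triangle), hence the factor 2 on
         its squared Frobenius norm */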
535       *norm = sqrt(sum[0] + 2*sum[1]);
536       ierr = PetscFree(lnorm2);CHKERRQ(ierr);
537     } else {
538       SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
539     }
540   }
541   PetscFunctionReturn(0);
542 }
543 
544 /*
545   Creates and fills the hash table.
546   This table is created only once.
547   If new entries need to be added to the matrix
548   then the hash table has to be destroyed and
549   recreated.
550 */
551 #undef __FUNCT__
552 #define __FUNCT__ "MatCreateHashTable_MPISBAIJ_Private"
553 int MatCreateHashTable_MPISBAIJ_Private(Mat mat,PetscReal factor)
554 {
555   PetscFunctionBegin;
556   SETERRQ(1,"Function not yet written for SBAIJ format");
557   /* PetscFunctionReturn(0); */
558 }
559 
560 #undef __FUNCT__
561 #define __FUNCT__ "MatAssemblyBegin_MPISBAIJ"
562 int MatAssemblyBegin_MPISBAIJ(Mat mat,MatAssemblyType mode)
563 {
564   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
565   int         ierr,nstash,reallocs;
566   InsertMode  addv;
567 
568   PetscFunctionBegin;
569   if (baij->donotstash) {
570     PetscFunctionReturn(0);
571   }
572 
573   /* make sure all processors are in the same mode: either INSERT_VALUES or ADD_VALUES */
574   ierr = MPI_Allreduce(&mat->insertmode,&addv,1,MPI_INT,MPI_BOR,mat->comm);CHKERRQ(ierr);
575   if (addv == (ADD_VALUES|INSERT_VALUES)) {
576     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
577   }
578   mat->insertmode = addv; /* in case this processor had no cache */
579 
580   ierr = MatStashScatterBegin_Private(&mat->stash,baij->rowners_bs);CHKERRQ(ierr);
581   ierr = MatStashScatterBegin_Private(&mat->bstash,baij->rowners);CHKERRQ(ierr);
582   ierr = MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);CHKERRQ(ierr);
583   PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
584   ierr = MatStashGetInfo_Private(&mat->bstash,&nstash,&reallocs);CHKERRQ(ierr);
585   PetscLogInfo(0,"MatAssemblyBegin_MPISBAIJ:Block-Stash has %d entries, uses %d mallocs.\n",nstash,reallocs);
586   PetscFunctionReturn(0);
587 }
588 
589 #undef __FUNCT__
590 #define __FUNCT__ "MatAssemblyEnd_MPISBAIJ"
591 int MatAssemblyEnd_MPISBAIJ(Mat mat,MatAssemblyType mode)
592 {
593   Mat_MPISBAIJ *baij=(Mat_MPISBAIJ*)mat->data;
594   Mat_SeqSBAIJ  *a=(Mat_SeqSBAIJ*)baij->A->data;
595   Mat_SeqBAIJ  *b=(Mat_SeqBAIJ*)baij->B->data;
596   int         i,j,rstart,ncols,n,ierr,flg,bs2=baij->bs2;
597   int         *row,*col,other_disassembled;
598   PetscTruth  r1,r2,r3;
599   MatScalar   *val;
600   InsertMode  addv = mat->insertmode;
601 
602   PetscFunctionBegin;
603 
604   if (!baij->donotstash) {
605     while (1) {
606       ierr = MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
607       /*
608       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d]: in AssemblyEnd, stash, flg=%d\n",rank,flg);
609       PetscSynchronizedFlush(PETSC_COMM_WORLD);
610       */
611       if (!flg) break;
612 
613       for (i=0; i<n;) {
614         /* Now identify the consecutive vals belonging to the same row */
615         for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
616         if (j < n) ncols = j-i;
617         else       ncols = n-i;
618         /* Now assemble all these values with a single function call */
619         ierr = MatSetValues_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i,addv);CHKERRQ(ierr);
620         i = j;
621       }
622     }
623     ierr = MatStashScatterEnd_Private(&mat->stash);CHKERRQ(ierr);
624     /* Now process the block-stash. Since the values are stashed column-oriented,
625        set the roworiented flag to column oriented, and after MatSetValues()
626        restore the original flags */
627     r1 = baij->roworiented;
628     r2 = a->roworiented;
629     r3 = b->roworiented;
630     baij->roworiented = PETSC_FALSE;
631     a->roworiented    = PETSC_FALSE;
632     b->roworiented    = PETSC_FALSE;
633     while (1) {
634       ierr = MatStashScatterGetMesg_Private(&mat->bstash,&n,&row,&col,&val,&flg);CHKERRQ(ierr);
635       if (!flg) break;
636 
637       for (i=0; i<n;) {
638         /* Now identify the consecutive vals belonging to the same row */
639         for (j=i,rstart=row[j]; j<n; j++) { if (row[j] != rstart) break; }
640         if (j < n) ncols = j-i;
641         else       ncols = n-i;
642         ierr = MatSetValuesBlocked_MPISBAIJ_MatScalar(mat,1,row+i,ncols,col+i,val+i*bs2,addv);CHKERRQ(ierr);
643         i = j;
644       }
645     }
646     ierr = MatStashScatterEnd_Private(&mat->bstash);CHKERRQ(ierr);
647     baij->roworiented = r1;
648     a->roworiented    = r2;
649     b->roworiented    = r3;
650   }
651 
652   ierr = MatAssemblyBegin(baij->A,mode);CHKERRQ(ierr);
653   ierr = MatAssemblyEnd(baij->A,mode);CHKERRQ(ierr);
654 
655   /* determine if any processor has disassembled, if so we must
656      also disassemble ourselves, in order that we may reassemble. */
657   /*
658      if nonzero structure of submatrix B cannot change then we know that
659      no processor disassembled thus we can skip this stuff
660   */
661   if (!((Mat_SeqBAIJ*)baij->B->data)->nonew)  {
662     ierr = MPI_Allreduce(&mat->was_assembled,&other_disassembled,1,MPI_INT,MPI_PROD,mat->comm);CHKERRQ(ierr);
663     if (mat->was_assembled && !other_disassembled) {
664       ierr = DisAssemble_MPISBAIJ(mat);CHKERRQ(ierr);
665     }
666   }
667 
668   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
669     ierr = MatSetUpMultiply_MPISBAIJ(mat);CHKERRQ(ierr); /* setup Mvctx and sMvctx */
670   }
671   ierr = MatAssemblyBegin(baij->B,mode);CHKERRQ(ierr);
672   ierr = MatAssemblyEnd(baij->B,mode);CHKERRQ(ierr);
673 
674 #if defined(PETSC_USE_BOPT_g)
675   if (baij->ht && mode== MAT_FINAL_ASSEMBLY) {
676     PetscLogInfo(0,"MatAssemblyEnd_MPISBAIJ:Average Hash Table Search in MatSetValues = %5.2f\n",((PetscReal)baij->ht_total_ct)/baij->ht_insert_ct);
677     baij->ht_total_ct  = 0;
678     baij->ht_insert_ct = 0;
679   }
680 #endif
681   if (baij->ht_flag && !baij->ht && mode == MAT_FINAL_ASSEMBLY) {
682     ierr = MatCreateHashTable_MPISBAIJ_Private(mat,baij->ht_fact);CHKERRQ(ierr);
683     mat->ops->setvalues        = MatSetValues_MPISBAIJ_HT;
684     mat->ops->setvaluesblocked = MatSetValuesBlocked_MPISBAIJ_HT;
685   }
686 
687   if (baij->rowvalues) {
688     ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);
689     baij->rowvalues = 0;
690   }
691   PetscFunctionReturn(0);
692 }
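/*
   A hedged sketch of the typical calling sequence the two assembly routines above support;
   bs, m, M, d_nz, o_nz, row, col, and val are placeholders, not values taken from this file:

     Mat mat;
     ierr = MatCreateMPISBAIJ(PETSC_COMM_WORLD,bs,m,m,M,M,d_nz,PETSC_NULL,o_nz,PETSC_NULL,&mat);CHKERRQ(ierr);
     (each process sets entries with global indices, upper triangle only)
     ierr = MatSetValues(mat,1,&row,1,&col,&val,INSERT_VALUES);CHKERRQ(ierr);
     (off-process entries are stashed above and exchanged during assembly)
     ierr = MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/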
693 
694 #undef __FUNCT__
695 #define __FUNCT__ "MatView_MPISBAIJ_ASCIIorDraworSocket"
696 static int MatView_MPISBAIJ_ASCIIorDraworSocket(Mat mat,PetscViewer viewer)
697 {
698   Mat_MPISBAIJ      *baij = (Mat_MPISBAIJ*)mat->data;
699   int               ierr,bs = baij->bs,size = baij->size,rank = baij->rank;
700   PetscTruth        isascii,isdraw;
701   PetscViewer       sviewer;
702   PetscViewerFormat format;
703 
704   PetscFunctionBegin;
705   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
706   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
707   if (isascii) {
708     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
709     if (format == PETSC_VIEWER_ASCII_INFO_LONG) {
710       MatInfo info;
711       ierr = MPI_Comm_rank(mat->comm,&rank);CHKERRQ(ierr);
712       ierr = MatGetInfo(mat,MAT_LOCAL,&info);CHKERRQ(ierr);
713       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Local rows %d nz %d nz alloced %d bs %d mem %d\n",
714               rank,mat->m,(int)info.nz_used*bs,(int)info.nz_allocated*bs,
715               baij->bs,(int)info.memory);CHKERRQ(ierr);
716       ierr = MatGetInfo(baij->A,MAT_LOCAL,&info);CHKERRQ(ierr);
717       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] on-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
718       ierr = MatGetInfo(baij->B,MAT_LOCAL,&info);CHKERRQ(ierr);
719       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] off-diagonal part: nz %d \n",rank,(int)info.nz_used*bs);CHKERRQ(ierr);
720       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
721       ierr = VecScatterView(baij->Mvctx,viewer);CHKERRQ(ierr);
722       PetscFunctionReturn(0);
723     } else if (format == PETSC_VIEWER_ASCII_INFO) {
724       ierr = PetscViewerASCIIPrintf(viewer,"  block size is %d\n",bs);CHKERRQ(ierr);
725       PetscFunctionReturn(0);
726     }
727   }
728 
729   if (isdraw) {
730     PetscDraw       draw;
731     PetscTruth isnull;
732     ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
733     ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr); if (isnull) PetscFunctionReturn(0);
734   }
735 
736   if (size == 1) {
737     ierr = PetscObjectSetName((PetscObject)baij->A,mat->name);CHKERRQ(ierr);
738     ierr = MatView(baij->A,viewer);CHKERRQ(ierr);
739   } else {
740     /* assemble the entire matrix onto first processor. */
741     Mat         A;
742     Mat_SeqSBAIJ *Aloc;
743     Mat_SeqBAIJ *Bloc;
744     int         M = mat->M,N = mat->N,*ai,*aj,col,i,j,k,*rvals,mbs = baij->mbs;
745     MatScalar   *a;
746 
747     if (!rank) {
748       ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,M,N,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
749     } else {
750       ierr = MatCreateMPISBAIJ(mat->comm,baij->bs,0,0,M,N,0,PETSC_NULL,0,PETSC_NULL,&A);CHKERRQ(ierr);
751     }
752     PetscLogObjectParent(mat,A);
753 
754     /* copy over the A part */
755     Aloc  = (Mat_SeqSBAIJ*)baij->A->data;
756     ai    = Aloc->i; aj = Aloc->j; a = Aloc->a;
757     ierr  = PetscMalloc(bs*sizeof(int),&rvals);CHKERRQ(ierr);
758 
759     for (i=0; i<mbs; i++) {
760       rvals[0] = bs*(baij->rstart + i);
761       for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
762       for (j=ai[i]; j<ai[i+1]; j++) {
763         col = (baij->cstart+aj[j])*bs;
764         for (k=0; k<bs; k++) {
765           ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
766           col++; a += bs;
767         }
768       }
769     }
770     /* copy over the B part */
771     Bloc = (Mat_SeqBAIJ*)baij->B->data;
772     ai = Bloc->i; aj = Bloc->j; a = Bloc->a;
773     for (i=0; i<mbs; i++) {
774       rvals[0] = bs*(baij->rstart + i);
775       for (j=1; j<bs; j++) { rvals[j] = rvals[j-1] + 1; }
776       for (j=ai[i]; j<ai[i+1]; j++) {
777         col = baij->garray[aj[j]]*bs;
778         for (k=0; k<bs; k++) {
779           ierr = MatSetValues_MPISBAIJ_MatScalar(A,bs,rvals,1,&col,a,INSERT_VALUES);CHKERRQ(ierr);
780           col++; a += bs;
781         }
782       }
783     }
784     ierr = PetscFree(rvals);CHKERRQ(ierr);
785     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
786     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
787     /*
788        Everyone has to make this call to draw the matrix since the graphics waits are
789        synchronized across all processors that share the PetscDraw object
790     */
791     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
792     if (!rank) {
793       ierr = PetscObjectSetName((PetscObject)((Mat_MPISBAIJ*)(A->data))->A,mat->name);CHKERRQ(ierr);
794       ierr = MatView(((Mat_MPISBAIJ*)(A->data))->A,sviewer);CHKERRQ(ierr);
795     }
796     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
797     ierr = MatDestroy(A);CHKERRQ(ierr);
798   }
799   PetscFunctionReturn(0);
800 }
801 
802 #undef __FUNCT__
803 #define __FUNCT__ "MatView_MPISBAIJ"
804 int MatView_MPISBAIJ(Mat mat,PetscViewer viewer)
805 {
806   int        ierr;
807   PetscTruth isascii,isdraw,issocket,isbinary;
808 
809   PetscFunctionBegin;
810   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);CHKERRQ(ierr);
811   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_DRAW,&isdraw);CHKERRQ(ierr);
812   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_SOCKET,&issocket);CHKERRQ(ierr);
813   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_BINARY,&isbinary);CHKERRQ(ierr);
814   if (isascii || isdraw || issocket || isbinary) {
815     ierr = MatView_MPISBAIJ_ASCIIorDraworSocket(mat,viewer);CHKERRQ(ierr);
816   } else {
817     SETERRQ1(1,"Viewer type %s not supported by MPISBAIJ matrices",((PetscObject)viewer)->type_name);
818   }
819   PetscFunctionReturn(0);
820 }
821 
822 #undef __FUNCT__
823 #define __FUNCT__ "MatDestroy_MPISBAIJ"
824 int MatDestroy_MPISBAIJ(Mat mat)
825 {
826   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
827   int         ierr;
828 
829   PetscFunctionBegin;
830 #if defined(PETSC_USE_LOG)
831   PetscLogObjectState((PetscObject)mat,"Rows=%d,Cols=%d",mat->M,mat->N);
832 #endif
833   ierr = MatStashDestroy_Private(&mat->stash);CHKERRQ(ierr);
834   ierr = MatStashDestroy_Private(&mat->bstash);CHKERRQ(ierr);
835   ierr = PetscFree(baij->rowners);CHKERRQ(ierr);
836   ierr = MatDestroy(baij->A);CHKERRQ(ierr);
837   ierr = MatDestroy(baij->B);CHKERRQ(ierr);
838 #if defined (PETSC_USE_CTABLE)
839   if (baij->colmap) {ierr = PetscTableDelete(baij->colmap);CHKERRQ(ierr);}
840 #else
841   if (baij->colmap) {ierr = PetscFree(baij->colmap);CHKERRQ(ierr);}
842 #endif
843   if (baij->garray) {ierr = PetscFree(baij->garray);CHKERRQ(ierr);}
844   if (baij->lvec)   {ierr = VecDestroy(baij->lvec);CHKERRQ(ierr);}
845   if (baij->Mvctx)  {ierr = VecScatterDestroy(baij->Mvctx);CHKERRQ(ierr);}
846   if (baij->slvec0) {
847     ierr = VecDestroy(baij->slvec0);CHKERRQ(ierr);
848     ierr = VecDestroy(baij->slvec0b);CHKERRQ(ierr);
849   }
850   if (baij->slvec1) {
851     ierr = VecDestroy(baij->slvec1);CHKERRQ(ierr);
852     ierr = VecDestroy(baij->slvec1a);CHKERRQ(ierr);
853     ierr = VecDestroy(baij->slvec1b);CHKERRQ(ierr);
854   }
855   if (baij->sMvctx)  {ierr = VecScatterDestroy(baij->sMvctx);CHKERRQ(ierr);}
856   if (baij->rowvalues) {ierr = PetscFree(baij->rowvalues);CHKERRQ(ierr);}
857   if (baij->barray) {ierr = PetscFree(baij->barray);CHKERRQ(ierr);}
858   if (baij->hd) {ierr = PetscFree(baij->hd);CHKERRQ(ierr);}
859 #if defined(PETSC_USE_MAT_SINGLE)
860   if (baij->setvaluescopy) {ierr = PetscFree(baij->setvaluescopy);CHKERRQ(ierr);}
861 #endif
862   ierr = PetscFree(baij);CHKERRQ(ierr);
863   PetscFunctionReturn(0);
864 }
865 
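/*
   The MatMult/MatMultAdd routines below apply the symmetric matrix with a single forward
   scatter: the local symmetric block gives A_loc*x (into slvec1a), B^T*x is computed into
   slvec0b, both x and B^T*x are packed into slvec0, the scatter on sMvctx adds the B^T*x
   pieces into the owning processes' slvec1a while gathering the ghost values of x into
   slvec1b, and finally y = B*slvec1b + slvec1a. The _2comm variants instead use the
   conventional pair of scatters on Mvctx (forward for B*x_ghost, reverse-add for B^T*x).
*/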
866 #undef __FUNCT__
867 #define __FUNCT__ "MatMult_MPISBAIJ"
868 int MatMult_MPISBAIJ(Mat A,Vec xx,Vec yy)
869 {
870   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
871   int         ierr,nt,mbs=a->mbs,bs=a->bs;
872   PetscScalar *x,*from,zero=0.0;
873 
874   PetscFunctionBegin;
875   /*
876   PetscSynchronizedPrintf(PETSC_COMM_WORLD," _1comm is called ...\n");
877   PetscSynchronizedFlush(PETSC_COMM_WORLD);
878   */
879   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
880   if (nt != A->n) {
881     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and xx");
882   }
883   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
884   if (nt != A->m) {
885     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and yy");
886   }
887 
888   /* diagonal part */
889   ierr = (*a->A->ops->mult)(a->A,xx,a->slvec1a);CHKERRQ(ierr);
890   ierr = VecSet(&zero,a->slvec1b);CHKERRQ(ierr);
891 
892   /* subdiagonal part */
893   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->slvec0b);CHKERRQ(ierr);
894 
895   /* copy x into the vec slvec0 */
896   ierr = VecGetArray(a->slvec0,&from);CHKERRQ(ierr);
897   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
898   ierr = PetscMemcpy(from,x,bs*mbs*sizeof(MatScalar));CHKERRQ(ierr);
899   ierr = VecRestoreArray(a->slvec0,&from);CHKERRQ(ierr);
900 
901   ierr = VecScatterBegin(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);
902   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
903   ierr = VecScatterEnd(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);
904 
905   /* superdiagonal part */
906   ierr = (*a->B->ops->multadd)(a->B,a->slvec1b,a->slvec1a,yy);CHKERRQ(ierr);
907 
908   PetscFunctionReturn(0);
909 }
910 
911 #undef __FUNCT__
912 #define __FUNCT__ "MatMult_MPISBAIJ_2comm"
913 int MatMult_MPISBAIJ_2comm(Mat A,Vec xx,Vec yy)
914 {
915   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
916   int         ierr,nt;
917 
918   PetscFunctionBegin;
919   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
920   if (nt != A->n) {
921     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and xx");
922   }
923   ierr = VecGetLocalSize(yy,&nt);CHKERRQ(ierr);
924   if (nt != A->m) {
925     SETERRQ(PETSC_ERR_ARG_SIZ,"Incompatible partition of A and yy");
926   }
927 
928   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
929   /* do diagonal part */
930   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
931   /* do superdiagonal part */
932   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
933   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
934   /* do subdiagonal part */
935   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
936   ierr = VecScatterBegin(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
937   ierr = VecScatterEnd(a->lvec,yy,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
938 
939   PetscFunctionReturn(0);
940 }
941 
942 #undef __FUNCT__
943 #define __FUNCT__ "MatMultAdd_MPISBAIJ"
944 int MatMultAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
945 {
946   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
947   int          ierr,mbs=a->mbs,bs=a->bs;
948   PetscScalar  *x,*from,zero=0.0;
949 
950   PetscFunctionBegin;
951   /*
952   PetscSynchronizedPrintf(PETSC_COMM_WORLD," MatMultAdd is called ...\n");
953   PetscSynchronizedFlush(PETSC_COMM_WORLD);
954   */
955   /* diagonal part */
956   ierr = (*a->A->ops->multadd)(a->A,xx,yy,a->slvec1a);CHKERRQ(ierr);
957   ierr = VecSet(&zero,a->slvec1b);CHKERRQ(ierr);
958 
959   /* subdiagonal part */
960   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->slvec0b);CHKERRQ(ierr);
961 
962   /* copy x into the vec slvec0 */
963   ierr = VecGetArray(a->slvec0,&from);CHKERRQ(ierr);
964   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
965   ierr = PetscMemcpy(from,x,bs*mbs*sizeof(MatScalar));CHKERRQ(ierr);
966   ierr = VecRestoreArray(a->slvec0,&from);CHKERRQ(ierr);
967 
968   ierr = VecScatterBegin(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);
969   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
970   ierr = VecScatterEnd(a->slvec0,a->slvec1,ADD_VALUES,SCATTER_FORWARD,a->sMvctx);CHKERRQ(ierr);
971 
972   /* superdiagonal part */
973   ierr = (*a->B->ops->multadd)(a->B,a->slvec1b,a->slvec1a,zz);CHKERRQ(ierr);
974 
975   PetscFunctionReturn(0);
976 }
977 
978 #undef __FUNCT__
979 #define __FUNCT__ "MatMultAdd_MPISBAIJ_2comm"
980 int MatMultAdd_MPISBAIJ_2comm(Mat A,Vec xx,Vec yy,Vec zz)
981 {
982   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
983   int        ierr;
984 
985   PetscFunctionBegin;
986   ierr = VecScatterBegin(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
987   /* do diagonal part */
988   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
989   /* do superdiagonal part */
990   ierr = VecScatterEnd(xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD,a->Mvctx);CHKERRQ(ierr);
991   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
992 
993   /* do subdiagonal part */
994   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
995   ierr = VecScatterBegin(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
996   ierr = VecScatterEnd(a->lvec,zz,ADD_VALUES,SCATTER_REVERSE,a->Mvctx);CHKERRQ(ierr);
997 
998   PetscFunctionReturn(0);
999 }
1000 
1001 #undef __FUNCT__
1002 #define __FUNCT__ "MatMultTranspose_MPISBAIJ"
1003 int MatMultTranspose_MPISBAIJ(Mat A,Vec xx,Vec yy)
1004 {
1005   PetscFunctionBegin;
1006   SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1007   /* PetscFunctionReturn(0); */
1008 }
1009 
1010 #undef __FUNCT__
1011 #define __FUNCT__ "MatMultTransposeAdd_MPISBAIJ"
1012 int MatMultTransposeAdd_MPISBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1013 {
1014   PetscFunctionBegin;
1015   SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1016   /* PetscFunctionReturn(0); */
1017 }
1018 
1019 /*
1020   This only works correctly for square matrices where the subblock A->A is the
1021    diagonal block
1022 */
1023 #undef __FUNCT__
1024 #define __FUNCT__ "MatGetDiagonal_MPISBAIJ"
1025 int MatGetDiagonal_MPISBAIJ(Mat A,Vec v)
1026 {
1027   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
1028   int         ierr;
1029 
1030   PetscFunctionBegin;
1031   /* if (a->M != a->N) SETERRQ(PETSC_ERR_SUP,"Supports only square matrix where A->A is diag block"); */
1032   ierr = MatGetDiagonal(a->A,v);CHKERRQ(ierr);
1033   PetscFunctionReturn(0);
1034 }
1035 
1036 #undef __FUNCT__
1037 #define __FUNCT__ "MatScale_MPISBAIJ"
1038 int MatScale_MPISBAIJ(PetscScalar *aa,Mat A)
1039 {
1040   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
1041   int         ierr;
1042 
1043   PetscFunctionBegin;
1044   ierr = MatScale(aa,a->A);CHKERRQ(ierr);
1045   ierr = MatScale(aa,a->B);CHKERRQ(ierr);
1046   PetscFunctionReturn(0);
1047 }
1048 
1049 #undef __FUNCT__
1050 #define __FUNCT__ "MatGetRow_MPISBAIJ"
1051 int MatGetRow_MPISBAIJ(Mat matin,int row,int *nz,int **idx,PetscScalar **v)
1052 {
1053   Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
1054   PetscScalar    *vworkA,*vworkB,**pvA,**pvB,*v_p;
1055   int            bs = mat->bs,bs2 = mat->bs2,i,ierr,*cworkA,*cworkB,**pcA,**pcB;
1056   int            nztot,nzA,nzB,lrow,brstart = mat->rstart*bs,brend = mat->rend*bs;
1057   int            *cmap,*idx_p,cstart = mat->cstart;
1058 
1059   PetscFunctionBegin;
1060   if (mat->getrowactive == PETSC_TRUE) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Already active");
1061   mat->getrowactive = PETSC_TRUE;
1062 
1063   if (!mat->rowvalues && (idx || v)) {
1064     /*
1065         allocate enough space to hold information from the longest row.
1066     */
1067     Mat_SeqSBAIJ *Aa = (Mat_SeqSBAIJ*)mat->A->data;
1068     Mat_SeqBAIJ  *Ba = (Mat_SeqBAIJ*)mat->B->data;
1069     int     max = 1,mbs = mat->mbs,tmp;
1070     for (i=0; i<mbs; i++) {
1071       tmp = Aa->i[i+1] - Aa->i[i] + Ba->i[i+1] - Ba->i[i]; /* row length */
1072       if (max < tmp) { max = tmp; }
1073     }
1074     ierr = PetscMalloc(max*bs2*(sizeof(int)+sizeof(PetscScalar)),&mat->rowvalues);CHKERRQ(ierr);
1075     mat->rowindices = (int*)(mat->rowvalues + max*bs2);
1076   }
1077 
1078   if (row < brstart || row >= brend) SETERRQ(PETSC_ERR_SUP,"Only local rows")
1079   lrow = row - brstart;  /* local row index */
1080 
1081   pvA = &vworkA; pcA = &cworkA; pvB = &vworkB; pcB = &cworkB;
1082   if (!v)   {pvA = 0; pvB = 0;}
1083   if (!idx) {pcA = 0; if (!v) pcB = 0;}
1084   ierr = (*mat->A->ops->getrow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
1085   ierr = (*mat->B->ops->getrow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
1086   nztot = nzA + nzB;
1087 
1088   cmap  = mat->garray;
1089   if (v  || idx) {
1090     if (nztot) {
1091       /* Sort by increasing column numbers, assuming A and B already sorted */
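      /* merge in three pieces: B entries whose global block column precedes the local
         diagonal block (cmap[...] < cstart), then all of A (the local diagonal block),
         then the remaining B entries; imark marks where the first piece ends */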
1092       int imark = -1;
1093       if (v) {
1094         *v = v_p = mat->rowvalues;
1095         for (i=0; i<nzB; i++) {
1096           if (cmap[cworkB[i]/bs] < cstart)   v_p[i] = vworkB[i];
1097           else break;
1098         }
1099         imark = i;
1100         for (i=0; i<nzA; i++)     v_p[imark+i] = vworkA[i];
1101         for (i=imark; i<nzB; i++) v_p[nzA+i]   = vworkB[i];
1102       }
1103       if (idx) {
1104         *idx = idx_p = mat->rowindices;
1105         if (imark > -1) {
1106           for (i=0; i<imark; i++) {
1107             idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs;
1108           }
1109         } else {
1110           for (i=0; i<nzB; i++) {
1111             if (cmap[cworkB[i]/bs] < cstart)
1112               idx_p[i] = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1113             else break;
1114           }
1115           imark = i;
1116         }
1117         for (i=0; i<nzA; i++)     idx_p[imark+i] = cstart*bs + cworkA[i];
1118         for (i=imark; i<nzB; i++) idx_p[nzA+i]   = cmap[cworkB[i]/bs]*bs + cworkB[i]%bs ;
1119       }
1120     } else {
1121       if (idx) *idx = 0;
1122       if (v)   *v   = 0;
1123     }
1124   }
1125   *nz = nztot;
1126   ierr = (*mat->A->ops->restorerow)(mat->A,lrow,&nzA,pcA,pvA);CHKERRQ(ierr);
1127   ierr = (*mat->B->ops->restorerow)(mat->B,lrow,&nzB,pcB,pvB);CHKERRQ(ierr);
1128   PetscFunctionReturn(0);
1129 }
1130 
1131 #undef __FUNCT__
1132 #define __FUNCT__ "MatRestoreRow_MPISBAIJ"
1133 int MatRestoreRow_MPISBAIJ(Mat mat,int row,int *nz,int **idx,PetscScalar **v)
1134 {
1135   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1136 
1137   PetscFunctionBegin;
1138   if (baij->getrowactive == PETSC_FALSE) {
1139     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"MatGetRow not called");
1140   }
1141   baij->getrowactive = PETSC_FALSE;
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 #undef __FUNCT__
1146 #define __FUNCT__ "MatGetBlockSize_MPISBAIJ"
1147 int MatGetBlockSize_MPISBAIJ(Mat mat,int *bs)
1148 {
1149   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1150 
1151   PetscFunctionBegin;
1152   *bs = baij->bs;
1153   PetscFunctionReturn(0);
1154 }
1155 
1156 #undef __FUNCT__
1157 #define __FUNCT__ "MatZeroEntries_MPISBAIJ"
1158 int MatZeroEntries_MPISBAIJ(Mat A)
1159 {
1160   Mat_MPISBAIJ *l = (Mat_MPISBAIJ*)A->data;
1161   int         ierr;
1162 
1163   PetscFunctionBegin;
1164   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
1165   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
1166   PetscFunctionReturn(0);
1167 }
1168 
1169 #undef __FUNCT__
1170 #define __FUNCT__ "MatGetInfo_MPISBAIJ"
1171 int MatGetInfo_MPISBAIJ(Mat matin,MatInfoType flag,MatInfo *info)
1172 {
1173   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)matin->data;
1174   Mat         A = a->A,B = a->B;
1175   int         ierr;
1176   PetscReal   isend[5],irecv[5];
1177 
1178   PetscFunctionBegin;
1179   info->block_size     = (PetscReal)a->bs;
1180   ierr = MatGetInfo(A,MAT_LOCAL,info);CHKERRQ(ierr);
1181   isend[0] = info->nz_used; isend[1] = info->nz_allocated; isend[2] = info->nz_unneeded;
1182   isend[3] = info->memory;  isend[4] = info->mallocs;
1183   ierr = MatGetInfo(B,MAT_LOCAL,info);CHKERRQ(ierr);
1184   isend[0] += info->nz_used; isend[1] += info->nz_allocated; isend[2] += info->nz_unneeded;
1185   isend[3] += info->memory;  isend[4] += info->mallocs;
1186   if (flag == MAT_LOCAL) {
1187     info->nz_used      = isend[0];
1188     info->nz_allocated = isend[1];
1189     info->nz_unneeded  = isend[2];
1190     info->memory       = isend[3];
1191     info->mallocs      = isend[4];
1192   } else if (flag == MAT_GLOBAL_MAX) {
1193     ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_MAX,matin->comm);CHKERRQ(ierr);
1194     info->nz_used      = irecv[0];
1195     info->nz_allocated = irecv[1];
1196     info->nz_unneeded  = irecv[2];
1197     info->memory       = irecv[3];
1198     info->mallocs      = irecv[4];
1199   } else if (flag == MAT_GLOBAL_SUM) {
1200     ierr = MPI_Allreduce(isend,irecv,5,MPIU_REAL,MPI_SUM,matin->comm);CHKERRQ(ierr);
1201     info->nz_used      = irecv[0];
1202     info->nz_allocated = irecv[1];
1203     info->nz_unneeded  = irecv[2];
1204     info->memory       = irecv[3];
1205     info->mallocs      = irecv[4];
1206   } else {
1207     SETERRQ1(1,"Unknown MatInfoType argument %d",flag);
1208   }
1209   info->rows_global       = (PetscReal)A->M;
1210   info->columns_global    = (PetscReal)A->N;
1211   info->rows_local        = (PetscReal)A->m;
1212   info->columns_local     = (PetscReal)A->N;
1213   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1214   info->fill_ratio_needed = 0;
1215   info->factor_mallocs    = 0;
1216   PetscFunctionReturn(0);
1217 }
1218 
1219 #undef __FUNCT__
1220 #define __FUNCT__ "MatSetOption_MPISBAIJ"
1221 int MatSetOption_MPISBAIJ(Mat A,MatOption op)
1222 {
1223   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
1224   int         ierr;
1225 
1226   PetscFunctionBegin;
1227   switch (op) {
1228   case MAT_NO_NEW_NONZERO_LOCATIONS:
1229   case MAT_YES_NEW_NONZERO_LOCATIONS:
1230   case MAT_COLUMNS_UNSORTED:
1231   case MAT_COLUMNS_SORTED:
1232   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1233   case MAT_KEEP_ZEROED_ROWS:
1234   case MAT_NEW_NONZERO_LOCATION_ERR:
1235     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1236     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1237     break;
1238   case MAT_ROW_ORIENTED:
1239     a->roworiented = PETSC_TRUE;
1240     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1241     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1242     break;
1243   case MAT_ROWS_SORTED:
1244   case MAT_ROWS_UNSORTED:
1245   case MAT_YES_NEW_DIAGONALS:
1246   case MAT_USE_SINGLE_PRECISION_SOLVES:
1247     PetscLogInfo(A,"Info:MatSetOption_MPISBAIJ:Option ignored\n");
1248     break;
1249   case MAT_COLUMN_ORIENTED:
1250     a->roworiented = PETSC_FALSE;
1251     ierr = MatSetOption(a->A,op);CHKERRQ(ierr);
1252     ierr = MatSetOption(a->B,op);CHKERRQ(ierr);
1253     break;
1254   case MAT_IGNORE_OFF_PROC_ENTRIES:
1255     a->donotstash = PETSC_TRUE;
1256     break;
1257   case MAT_NO_NEW_DIAGONALS:
1258     SETERRQ(PETSC_ERR_SUP,"MAT_NO_NEW_DIAGONALS");
1259   case MAT_USE_HASH_TABLE:
1260     a->ht_flag = PETSC_TRUE;
1261     break;
1262   default:
1263     SETERRQ(PETSC_ERR_SUP,"unknown option");
1264   }
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 #undef __FUNCT__
1269 #define __FUNCT__ "MatTranspose_MPISBAIJ"
1270 int MatTranspose_MPISBAIJ(Mat A,Mat *matout)
1271 {
1272   PetscFunctionBegin;
1273   SETERRQ(1,"Matrix is symmetric. MatTranspose() should not be called");
1274   /* PetscFunctionReturn(0); */
1275 }
1276 
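/*
   Symmetric scaling D*A*D: the SBAIJ format requires the left and right vectors to be the
   same, so the diagonal block is scaled with (ll,rr) directly while the off-diagonal block
   B is row-scaled with the local vector and column-scaled with its scattered ghost values
   (baij->lvec); the scatter is overlapped with the local scaling work.
*/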
1277 #undef __FUNCT__
1278 #define __FUNCT__ "MatDiagonalScale_MPISBAIJ"
1279 int MatDiagonalScale_MPISBAIJ(Mat mat,Vec ll,Vec rr)
1280 {
1281   Mat_MPISBAIJ *baij = (Mat_MPISBAIJ*)mat->data;
1282   Mat         a = baij->A,b = baij->B;
1283   int         ierr,s1,s2,s3;
1284 
1285   PetscFunctionBegin;
1286   if (ll != rr) {
1287     SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be the same\n");
1288   }
1289   ierr = MatGetLocalSize(mat,&s2,&s3);CHKERRQ(ierr);
1290   if (rr) {
1291     ierr = VecGetLocalSize(rr,&s1);CHKERRQ(ierr);
1292     if (s1!=s3) SETERRQ(PETSC_ERR_ARG_SIZ,"right vector non-conforming local size");
1293     /* Overlap communication with computation. */
1294     ierr = VecScatterBegin(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
1295     /*} if (ll) { */
1296     ierr = VecGetLocalSize(ll,&s1);CHKERRQ(ierr);
1297     if (s1!=s2) SETERRQ(PETSC_ERR_ARG_SIZ,"left vector non-conforming local size");
1298     ierr = (*b->ops->diagonalscale)(b,ll,PETSC_NULL);CHKERRQ(ierr);
1299     /* } */
1300   /* scale  the diagonal block */
1301   ierr = (*a->ops->diagonalscale)(a,ll,rr);CHKERRQ(ierr);
1302 
1303   /* if (rr) { */
1304     /* Do a scatter end and then right scale the off-diagonal block */
1305     ierr = VecScatterEnd(rr,baij->lvec,INSERT_VALUES,SCATTER_FORWARD,baij->Mvctx);CHKERRQ(ierr);
1306     ierr = (*b->ops->diagonalscale)(b,PETSC_NULL,baij->lvec);CHKERRQ(ierr);
1307   }
1308 
1309   PetscFunctionReturn(0);
1310 }
1311 
1312 #undef __FUNCT__
1313 #define __FUNCT__ "MatZeroRows_MPISBAIJ"
1314 int MatZeroRows_MPISBAIJ(Mat A,IS is,PetscScalar *diag)
1315 {
1316   Mat_MPISBAIJ   *l = (Mat_MPISBAIJ*)A->data;
1317   int            i,ierr,N,*rows,*owners = l->rowners,size = l->size;
1318   int            *procs,*nprocs,j,idx,nsends,*work,row;
1319   int            nmax,*svalues,*starts,*owner,nrecvs,rank = l->rank;
1320   int            *rvalues,tag = A->tag,count,base,slen,n,*source;
1321   int            *lens,imdex,*lrows,*values,bs=l->bs,rstart_bs=l->rstart_bs;
1322   MPI_Comm       comm = A->comm;
1323   MPI_Request    *send_waits,*recv_waits;
1324   MPI_Status     recv_status,*send_status;
1325   IS             istmp;
1326   PetscTruth     found;
1327 
1328   PetscFunctionBegin;
1329   ierr = ISGetSize(is,&N);CHKERRQ(ierr);
1330   ierr = ISGetIndices(is,&rows);CHKERRQ(ierr);
1331 
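  /*
     Overall plan: determine which process owns each requested row, let every process learn
     how many rows it will receive (Allreduce with PetscMaxSum_Op), exchange the row indices
     with nonblocking sends/receives, then zero the locally owned rows and optionally set
     the diagonal entries.
  */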
1332   /*  first count number of contributors to each processor */
1333   ierr  = PetscMalloc(2*size*sizeof(int),&nprocs);CHKERRQ(ierr);
1334   ierr  = PetscMemzero(nprocs,2*size*sizeof(int));CHKERRQ(ierr);
1335   procs = nprocs + size;
1336   ierr  = PetscMalloc((N+1)*sizeof(int),&owner);CHKERRQ(ierr); /* see note*/
1337   for (i=0; i<N; i++) {
1338     idx   = rows[i];
1339     found = PETSC_FALSE;
1340     for (j=0; j<size; j++) {
1341       if (idx >= owners[j]*bs && idx < owners[j+1]*bs) {
1342         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
1343       }
1344     }
1345     if (!found) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index out of range");
1346   }
1347   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}
1348 
1349   /* inform other processors of number of messages and max length*/
1350   ierr   = PetscMalloc(2*size*sizeof(int),&work);CHKERRQ(ierr);
1351   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);CHKERRQ(ierr);
1352   nmax   = work[rank];
1353   nrecvs = work[size+rank];
1354   ierr   = PetscFree(work);CHKERRQ(ierr);
1355 
1356   /* post receives:   */
1357   ierr = PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);CHKERRQ(ierr);
1358   ierr = PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);CHKERRQ(ierr);
1359   for (i=0; i<nrecvs; i++) {
1360     ierr = MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);CHKERRQ(ierr);
1361   }
1362 
1363   /* do sends:
1364      1) starts[i] gives the starting index in svalues for stuff going to
1365      the ith processor
1366   */
1367   ierr = PetscMalloc((N+1)*sizeof(int),&svalues);CHKERRQ(ierr);
1368   ierr = PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);CHKERRQ(ierr);
1369   ierr = PetscMalloc((size+1)*sizeof(int),&starts);CHKERRQ(ierr);
1370   starts[0]  = 0;
1371   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1372   for (i=0; i<N; i++) {
1373     svalues[starts[owner[i]]++] = rows[i];
1374   }
1375   ierr = ISRestoreIndices(is,&rows);CHKERRQ(ierr);
1376 
1377   starts[0] = 0;
1378   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1379   count = 0;
1380   for (i=0; i<size; i++) {
1381     if (procs[i]) {
1382       ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
1383     }
1384   }
1385   ierr = PetscFree(starts);CHKERRQ(ierr);
1386 
1387   base = owners[rank]*bs;
1388 
1389   /*  wait on receives */
1390   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);CHKERRQ(ierr);
1391   source = lens + nrecvs;
1392   count  = nrecvs; slen = 0;
1393   while (count) {
1394     ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
1395     /* unpack receives into our local space */
1396     ierr = MPI_Get_count(&recv_status,MPI_INT,&n);CHKERRQ(ierr);
1397     source[imdex]  = recv_status.MPI_SOURCE;
1398     lens[imdex]    = n;
1399     slen          += n;
1400     count--;
1401   }
1402   ierr = PetscFree(recv_waits);CHKERRQ(ierr);
1403 
1404   /* move the data into the send scatter */
1405   ierr = PetscMalloc((slen+1)*sizeof(int),&lrows);CHKERRQ(ierr);
1406   count = 0;
1407   for (i=0; i<nrecvs; i++) {
1408     values = rvalues + i*nmax;
1409     for (j=0; j<lens[i]; j++) {
1410       lrows[count++] = values[j] - base;
1411     }
1412   }
1413   ierr = PetscFree(rvalues);CHKERRQ(ierr);
1414   ierr = PetscFree(lens);CHKERRQ(ierr);
1415   ierr = PetscFree(owner);CHKERRQ(ierr);
1416   ierr = PetscFree(nprocs);CHKERRQ(ierr);
1417 
1418   /* actually zap the local rows */
1419   ierr = ISCreateGeneral(PETSC_COMM_SELF,slen,lrows,&istmp);CHKERRQ(ierr);
1420   PetscLogObjectParent(A,istmp);
1421 
1422   /*
1423         Zero the required rows. If the "diagonal block" of the matrix
1424      is square and the user wishes to set the diagonal, we use separate
1425      code so that MatSetValues() is not called for each diagonal entry,
1426      which would allocate new memory each time (many mallocs) and slow things down.
1427 
1428        Contributed by: Matthew Knepley
1429   */
1430   /* must zero l->B before l->A because the (diag) case below may put values into l->B*/
1431   ierr = MatZeroRows_SeqBAIJ(l->B,istmp,0);CHKERRQ(ierr);
1432   if (diag && (l->A->M == l->A->N)) {
1433     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,diag);CHKERRQ(ierr);
1434   } else if (diag) {
1435     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
1436     if (((Mat_SeqSBAIJ*)l->A->data)->nonew) {
1437       SETERRQ(PETSC_ERR_SUP,"MatZeroRows() on rectangular matrices cannot be used with the Mat options \n\
1438 MAT_NO_NEW_NONZERO_LOCATIONS,MAT_NEW_NONZERO_LOCATION_ERR,MAT_NEW_NONZERO_ALLOCATION_ERR");
1439     }
1440     for (i=0; i<slen; i++) {
1441       row  = lrows[i] + rstart_bs;
1442       ierr = MatSetValues(A,1,&row,1,&row,diag,INSERT_VALUES);CHKERRQ(ierr);
1443     }
1444     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1445     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1446   } else {
1447     ierr = MatZeroRows_SeqSBAIJ(l->A,istmp,0);CHKERRQ(ierr);
1448   }
1449 
1450   ierr = ISDestroy(istmp);CHKERRQ(ierr);
1451   ierr = PetscFree(lrows);CHKERRQ(ierr);
1452 
1453   /* wait on sends */
1454   if (nsends) {
1455     ierr = PetscMalloc(nsends*sizeof(MPI_Status),&send_status);CHKERRQ(ierr);
1456     ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
1457     ierr = PetscFree(send_status);CHKERRQ(ierr);
1458   }
1459   ierr = PetscFree(send_waits);CHKERRQ(ierr);
1460   ierr = PetscFree(svalues);CHKERRQ(ierr);
1461 
1462   PetscFunctionReturn(0);
1463 }
1464 
1465 #undef __FUNCT__
1466 #define __FUNCT__ "MatPrintHelp_MPISBAIJ"
1467 int MatPrintHelp_MPISBAIJ(Mat A)
1468 {
1469   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1470   MPI_Comm    comm = A->comm;
1471   static int  called = 0;
1472   int         ierr;
1473 
1474   PetscFunctionBegin;
1475   if (!a->rank) {
1476     ierr = MatPrintHelp_SeqSBAIJ(a->A);CHKERRQ(ierr);
1477   }
1478   if (called) {PetscFunctionReturn(0);} else called = 1;
1479   ierr = (*PetscHelpPrintf)(comm," Options for MATMPISBAIJ matrix format (the defaults):\n");CHKERRQ(ierr);
1480   ierr = (*PetscHelpPrintf)(comm,"  -mat_use_hash_table <factor>: Use hashtable for efficient matrix assembly\n");CHKERRQ(ierr);
1481   PetscFunctionReturn(0);
1482 }
1483 
1484 #undef __FUNCT__
1485 #define __FUNCT__ "MatSetUnfactored_MPISBAIJ"
1486 int MatSetUnfactored_MPISBAIJ(Mat A)
1487 {
1488   Mat_MPISBAIJ *a   = (Mat_MPISBAIJ*)A->data;
1489   int         ierr;
1490 
1491   PetscFunctionBegin;
1492   ierr = MatSetUnfactored(a->A);CHKERRQ(ierr);
1493   PetscFunctionReturn(0);
1494 }
1495 
1496 static int MatDuplicate_MPISBAIJ(Mat,MatDuplicateOption,Mat *);
1497 
1498 #undef __FUNCT__
1499 #define __FUNCT__ "MatEqual_MPISBAIJ"
1500 int MatEqual_MPISBAIJ(Mat A,Mat B,PetscTruth *flag)
1501 {
1502   Mat_MPISBAIJ *matB = (Mat_MPISBAIJ*)B->data,*matA = (Mat_MPISBAIJ*)A->data;
1503   Mat         a,b,c,d;
1504   PetscTruth  flg;
1505   int         ierr;
1506 
1507   PetscFunctionBegin;
1508   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg);CHKERRQ(ierr);
1509   if (!flg) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1510   a = matA->A; b = matA->B;
1511   c = matB->A; d = matB->B;
1512 
1513   ierr = MatEqual(a,c,&flg);CHKERRQ(ierr);
1514   if (flg == PETSC_TRUE) {
1515     ierr = MatEqual(b,d,&flg);CHKERRQ(ierr);
1516   }
1517   ierr = MPI_Allreduce(&flg,flag,1,MPI_INT,MPI_LAND,A->comm);CHKERRQ(ierr);
1518   PetscFunctionReturn(0);
1519 }
1520 
1521 #undef __FUNCT__
1522 #define __FUNCT__ "MatSetUpPreallocation_MPISBAIJ"
1523 int MatSetUpPreallocation_MPISBAIJ(Mat A)
1524 {
1525   int        ierr;
1526 
1527   PetscFunctionBegin;
1528   ierr = MatMPISBAIJSetPreallocation(A,1,PETSC_DEFAULT,0,PETSC_DEFAULT,0);CHKERRQ(ierr);
1529   PetscFunctionReturn(0);
1530 }
1531 /* -------------------------------------------------------------------*/
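/*
   The table below positionally fills the generic _MatOps function table for
   the MATMPISBAIJ type; a 0 entry means the corresponding matrix operation
   is not supported by this format.
*/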
1532 static struct _MatOps MatOps_Values = {
1533   MatSetValues_MPISBAIJ,
1534   MatGetRow_MPISBAIJ,
1535   MatRestoreRow_MPISBAIJ,
1536   MatMult_MPISBAIJ,
1537   MatMultAdd_MPISBAIJ,
1538   MatMultTranspose_MPISBAIJ,
1539   MatMultTransposeAdd_MPISBAIJ,
1540   0,
1541   0,
1542   0,
1543   0,
1544   0,
1545   0,
1546   MatRelax_MPISBAIJ,
1547   MatTranspose_MPISBAIJ,
1548   MatGetInfo_MPISBAIJ,
1549   MatEqual_MPISBAIJ,
1550   MatGetDiagonal_MPISBAIJ,
1551   MatDiagonalScale_MPISBAIJ,
1552   MatNorm_MPISBAIJ,
1553   MatAssemblyBegin_MPISBAIJ,
1554   MatAssemblyEnd_MPISBAIJ,
1555   0,
1556   MatSetOption_MPISBAIJ,
1557   MatZeroEntries_MPISBAIJ,
1558   MatZeroRows_MPISBAIJ,
1559   0,
1560   0,
1561   0,
1562   0,
1563   MatSetUpPreallocation_MPISBAIJ,
1564   0,
1565   0,
1566   0,
1567   0,
1568   MatDuplicate_MPISBAIJ,
1569   0,
1570   0,
1571   0,
1572   0,
1573   0,
1574   MatGetSubMatrices_MPISBAIJ,
1575   MatIncreaseOverlap_MPISBAIJ,
1576   MatGetValues_MPISBAIJ,
1577   0,
1578   MatPrintHelp_MPISBAIJ,
1579   MatScale_MPISBAIJ,
1580   0,
1581   0,
1582   0,
1583   MatGetBlockSize_MPISBAIJ,
1584   0,
1585   0,
1586   0,
1587   0,
1588   0,
1589   0,
1590   MatSetUnfactored_MPISBAIJ,
1591   0,
1592   MatSetValuesBlocked_MPISBAIJ,
1593   0,
1594   0,
1595   0,
1596   MatGetPetscMaps_Petsc,
1597   0,
1598   0,
1599   0,
1600   0,
1601   0,
1602   0,
1603   MatGetRowMax_MPISBAIJ};
1604 
1605 
1606 EXTERN_C_BEGIN
1607 #undef __FUNCT__
1608 #define __FUNCT__ "MatGetDiagonalBlock_MPISBAIJ"
1609 int MatGetDiagonalBlock_MPISBAIJ(Mat A,PetscTruth *iscopy,MatReuse reuse,Mat *a)
1610 {
1611   PetscFunctionBegin;
1612   *a      = ((Mat_MPISBAIJ *)A->data)->A;
1613   *iscopy = PETSC_FALSE;
1614   PetscFunctionReturn(0);
1615 }
1616 EXTERN_C_END
1617 
1618 EXTERN_C_BEGIN
1619 #undef __FUNCT__
1620 #define __FUNCT__ "MatCreate_MPISBAIJ"
1621 int MatCreate_MPISBAIJ(Mat B)
1622 {
1623   Mat_MPISBAIJ *b;
1624   int          ierr;
1625   PetscTruth   flg;
1626 
1627   PetscFunctionBegin;
1628 
1629   ierr    = PetscNew(Mat_MPISBAIJ,&b);CHKERRQ(ierr);
1630   B->data = (void*)b;
1631   ierr    = PetscMemzero(b,sizeof(Mat_MPISBAIJ));CHKERRQ(ierr);
1632   ierr    = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1633 
1634   B->ops->destroy    = MatDestroy_MPISBAIJ;
1635   B->ops->view       = MatView_MPISBAIJ;
1636   B->mapping    = 0;
1637   B->factor     = 0;
1638   B->assembled  = PETSC_FALSE;
1639 
1640   B->insertmode = NOT_SET_VALUES;
1641   ierr = MPI_Comm_rank(B->comm,&b->rank);CHKERRQ(ierr);
1642   ierr = MPI_Comm_size(B->comm,&b->size);CHKERRQ(ierr);
1643 
1644   /* build local table of row and column ownerships */
1645   ierr          = PetscMalloc(3*(b->size+2)*sizeof(int),&b->rowners);CHKERRQ(ierr);
1646   b->cowners    = b->rowners + b->size + 2;
1647   b->rowners_bs = b->cowners + b->size + 2;
1648   PetscLogObjectMemory(B,3*(b->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));
1649 
1650   /* build cache for off array entries formed */
1651   ierr = MatStashCreate_Private(B->comm,1,&B->stash);CHKERRQ(ierr);
1652   b->donotstash  = PETSC_FALSE;
1653   b->colmap      = PETSC_NULL;
1654   b->garray      = PETSC_NULL;
1655   b->roworiented = PETSC_TRUE;
1656 
1657 #if defined(PETSC_USE_MAT_SINGLE)
1658   /* stuff for MatSetValues_XXX in single precision */
1659   b->setvalueslen     = 0;
1660   b->setvaluescopy    = PETSC_NULL;
1661 #endif
1662 
1663   /* stuff used in block assembly */
1664   b->barray       = 0;
1665 
1666   /* stuff used for matrix vector multiply */
1667   b->lvec         = 0;
1668   b->Mvctx        = 0;
1669   b->slvec0       = 0;
1670   b->slvec0b      = 0;
1671   b->slvec1       = 0;
1672   b->slvec1a      = 0;
1673   b->slvec1b      = 0;
1674   b->sMvctx       = 0;
1675 
1676   /* stuff for MatGetRow() */
1677   b->rowindices   = 0;
1678   b->rowvalues    = 0;
1679   b->getrowactive = PETSC_FALSE;
1680 
1681   /* hash table stuff */
1682   b->ht           = 0;
1683   b->hd           = 0;
1684   b->ht_size      = 0;
1685   b->ht_flag      = PETSC_FALSE;
1686   b->ht_fact      = 0;
1687   b->ht_total_ct  = 0;
1688   b->ht_insert_ct = 0;
1689 
1690   ierr = PetscOptionsHasName(PETSC_NULL,"-mat_use_hash_table",&flg);CHKERRQ(ierr);
1691   if (flg) {
1692     PetscReal fact = 1.39;
1693     ierr = MatSetOption(B,MAT_USE_HASH_TABLE);CHKERRQ(ierr);
1694     ierr = PetscOptionsGetReal(PETSC_NULL,"-mat_use_hash_table",&fact,PETSC_NULL);CHKERRQ(ierr);
1695     if (fact <= 1.0) fact = 1.39;
1696     ierr = MatMPIBAIJSetHashTableFactor(B,fact);CHKERRQ(ierr);
1697     PetscLogInfo(0,"MatCreateMPISBAIJ:Hash table Factor used %5.2f\n",fact);
1698   }
1699   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatStoreValues_C",
1700                                      "MatStoreValues_MPISBAIJ",
1701                                      MatStoreValues_MPISBAIJ);CHKERRQ(ierr);
1702   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatRetrieveValues_C",
1703                                      "MatRetrieveValues_MPISBAIJ",
1704                                      MatRetrieveValues_MPISBAIJ);CHKERRQ(ierr);
1705   ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetDiagonalBlock_C",
1706                                      "MatGetDiagonalBlock_MPISBAIJ",
1707                                      MatGetDiagonalBlock_MPISBAIJ);CHKERRQ(ierr);
1708   PetscFunctionReturn(0);
1709 }
1710 EXTERN_C_END
1711 
1712 #undef __FUNCT__
1713 #define __FUNCT__ "MatMPISBAIJSetPreallocation"
1714 /*@C
1715    MatMPISBAIJSetPreallocation - For good matrix assembly performance
1716    the user should preallocate the matrix storage by setting the parameters
1717    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1718    performance can be increased by more than a factor of 50.
1719 
1720    Collective on Mat
1721 
1722    Input Parameters:
1723 +  A - the matrix
1724 .  bs   - size of block
1725 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1726            submatrix  (same for all local rows)
1727 .  d_nnz - array containing the number of block nonzeros in the various block rows
1728            in the diagonal portion of the local submatrix (possibly different for each block
1729            row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1730 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1731            submatrix (same for all local rows).
1732 -  o_nnz - array containing the number of nonzeros in the various block rows of the
1733            off-diagonal portion of the local submatrix (possibly different for
1734            each block row) or PETSC_NULL.
1735 
1736 
1737    Options Database Keys:
1738 .   -mat_no_unroll - uses code that does not unroll the loops in the
1739                      block calculations (much slower)
1740 .   -mat_block_size - size of the blocks to use
1741 
1742    Notes:
1743 
1744    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
1745    then it must be used on all processors that share the object for that argument.
1746 
1747    Storage Information:
1748    For a square global matrix we define each processor's diagonal portion
1749    to be its local rows and the corresponding columns (a square submatrix);
1750    each processor's off-diagonal portion encompasses the remainder of the
1751    local matrix (a rectangular submatrix).
1752 
1753    The user can specify preallocated storage for the diagonal part of
1754    the local submatrix with either d_nz or d_nnz (not both).  Set
1755    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1756    memory allocation.  Likewise, specify preallocated storage for the
1757    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1758 
1759    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1760    the figure below we depict these three local rows and all columns (0-11).
1761 
1762 .vb
1763            0 1 2 3 4 5 6 7 8 9 10 11
1764           -------------------
1765    row 3  |  o o o d d d o o o o o o
1766    row 4  |  o o o d d d o o o o o o
1767    row 5  |  o o o d d d o o o o o o
1768           -------------------
1769 .ve
1770 
1771    Thus, any entries in the d locations are stored in the d (diagonal)
1772    submatrix, and any entries in the o locations are stored in the
1773    o (off-diagonal) submatrix.  Note that the d submatrix is stored in
1774    the MATSEQSBAIJ format and the o submatrix in the MATSEQBAIJ format.
1775 
1776    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1777    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1778    In general, for PDE problems in which most nonzeros are near the diagonal,
1779    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1780    or you will get TERRIBLE performance; see the users' manual chapter on
1781    matrices.
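
   For example (a sketch only; the counts 3 and 1 are purely illustrative and
   must be replaced by values matching the actual nonzero pattern, and comm,
   bs, m, M and A stand for the user's communicator, block size, sizes and
   matrix), one might call
.vb
   ierr = MatCreate(comm,m,m,M,M,&A);CHKERRQ(ierr);
   ierr = MatSetType(A,MATMPISBAIJ);CHKERRQ(ierr);
   ierr = MatMPISBAIJSetPreallocation(A,bs,3,PETSC_NULL,1,PETSC_NULL);CHKERRQ(ierr);
.ve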
1782 
1783    Level: intermediate
1784 
1785 .keywords: matrix, block, aij, compressed row, sparse, parallel
1786 
1787 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1788 @*/
1789 
1790 int MatMPISBAIJSetPreallocation(Mat B,int bs,int d_nz,int *d_nnz,int o_nz,int *o_nnz)
1791 {
1792   Mat_MPISBAIJ *b;
1793   int          ierr,i,mbs,Mbs;
1794   PetscTruth   flg2;
1795 
1796   PetscFunctionBegin;
1797   ierr = PetscTypeCompare((PetscObject)B,MATMPISBAIJ,&flg2);CHKERRQ(ierr);
1798   if (!flg2) PetscFunctionReturn(0);
1799 
1800   ierr = PetscOptionsGetInt(PETSC_NULL,"-mat_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
1801 
1802   if (bs < 1) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Invalid block size specified, must be positive");
1803   if (d_nz == PETSC_DECIDE || d_nz == PETSC_DEFAULT) d_nz = 3;
1804   if (o_nz == PETSC_DECIDE || o_nz == PETSC_DEFAULT) o_nz = 1;
1805   if (d_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %d",d_nz);
1806   if (o_nz < 0) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %d",o_nz);
1807   if (d_nnz) {
1808     for (i=0; i<B->m/bs; i++) {
1809       if (d_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %d value %d",i,d_nnz[i]);
1810     }
1811   }
1812   if (o_nnz) {
1813     for (i=0; i<B->m/bs; i++) {
1814       if (o_nnz[i] < 0) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %d value %d",i,o_nnz[i]);
1815     }
1816   }
1817   B->preallocated = PETSC_TRUE;
1818   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->m,&B->M);CHKERRQ(ierr);
1819   ierr = PetscSplitOwnershipBlock(B->comm,bs,&B->n,&B->N);CHKERRQ(ierr);
1820   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->rmap);CHKERRQ(ierr);
1821   ierr = PetscMapCreateMPI(B->comm,B->m,B->M,&B->cmap);CHKERRQ(ierr);
1822 
1823   b   = (Mat_MPISBAIJ*)B->data;
1824   mbs = B->m/bs;
1825   Mbs = B->M/bs;
1826   if (mbs*bs != B->m) {
1827     SETERRQ2(PETSC_ERR_ARG_SIZ,"No of local rows %d must be divisible by blocksize %d",B->m,bs);
1828   }
1829 
1830   b->bs  = bs;
1831   b->bs2 = bs*bs;
1832   b->mbs = mbs;
1833   b->nbs = mbs;
1834   b->Mbs = Mbs;
1835   b->Nbs = Mbs;
1836 
1837   ierr = MPI_Allgather(&b->mbs,1,MPI_INT,b->rowners+1,1,MPI_INT,B->comm);CHKERRQ(ierr);
1838   b->rowners[0]    = 0;
1839   for (i=2; i<=b->size; i++) {
1840     b->rowners[i] += b->rowners[i-1];
1841   }
1842   b->rstart    = b->rowners[b->rank];
1843   b->rend      = b->rowners[b->rank+1];
1844   b->cstart    = b->rstart;
1845   b->cend      = b->rend;
1846   for (i=0; i<=b->size; i++) {
1847     b->rowners_bs[i] = b->rowners[i]*bs;
1848   }
1849   b->rstart_bs = b->rstart*bs;
1850   b->rend_bs   = b->rend*bs;
1851 
1852   b->cstart_bs = b->cstart*bs;
1853   b->cend_bs   = b->cend*bs;
1854 
1855 
1856   ierr = MatCreateSeqSBAIJ(PETSC_COMM_SELF,bs,B->m,B->m,d_nz,d_nnz,&b->A);CHKERRQ(ierr);
1857   PetscLogObjectParent(B,b->A);
1858   ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,B->m,B->M,o_nz,o_nnz,&b->B);CHKERRQ(ierr);
1859   PetscLogObjectParent(B,b->B);
1860 
1861   /* build cache for off array entries formed */
1862   ierr = MatStashCreate_Private(B->comm,bs,&B->bstash);CHKERRQ(ierr);
1863 
1864   PetscFunctionReturn(0);
1865 }
1866 
1867 #undef __FUNCT__
1868 #define __FUNCT__ "MatCreateMPISBAIJ"
1869 /*@C
1870    MatCreateMPISBAIJ - Creates a sparse parallel matrix in symmetric block AIJ format
1871    (block compressed row).  For good matrix assembly performance
1872    the user should preallocate the matrix storage by setting the parameters
1873    d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters accurately,
1874    performance can be increased by more than a factor of 50.
1875 
1876    Collective on MPI_Comm
1877 
1878    Input Parameters:
1879 +  comm - MPI communicator
1880 .  bs   - size of block
1881 .  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1882            This value should be the same as the local size used in creating the
1883            y vector for the matrix-vector product y = Ax.
1884 .  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1885            This value should be the same as the local size used in creating the
1886            x vector for the matrix-vector product y = Ax.
1887 .  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1888 .  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1889 .  d_nz  - number of block nonzeros per block row in diagonal portion of local
1890            submatrix  (same for all local rows)
1891 .  d_nnz - array containing the number of block nonzeros in the various block rows
1892            in the diagonal portion of the local submatrix (possibly different for each block
1893            row) or PETSC_NULL.  You must leave room for the diagonal entry even if it is zero.
1894 .  o_nz  - number of block nonzeros per block row in the off-diagonal portion of local
1895            submatrix (same for all local rows).
1896 -  o_nnz - array containing the number of nonzeros in the various block rows of the
1897            off-diagonal portion of the local submatrix (possibly different for
1898            each block row) or PETSC_NULL.
1899 
1900    Output Parameter:
1901 .  A - the matrix
1902 
1903    Options Database Keys:
1904 .   -mat_no_unroll - uses code that does not unroll the loops in the
1905                      block calculations (much slower)
1906 .   -mat_block_size - size of the blocks to use
1907 .   -mat_mpi - use the parallel matrix data structures even on one processor
1908                (defaults to using SeqBAIJ format on one processor)
1909 
1910    Notes:
1911    The user MUST specify either the local or global matrix dimensions
1912    (possibly both).
1913 
1914    If PETSC_DECIDE or PETSC_DETERMINE is used for a particular argument on one processor
1915    then it must be used on all processors that share the object for that argument.
1916 
1917    Storage Information:
1918    For a square global matrix we define each processor's diagonal portion
1919    to be its local rows and the corresponding columns (a square submatrix);
1920    each processor's off-diagonal portion encompasses the remainder of the
1921    local matrix (a rectangular submatrix).
1922 
1923    The user can specify preallocated storage for the diagonal part of
1924    the local submatrix with either d_nz or d_nnz (not both).  Set
1925    d_nz=PETSC_DEFAULT and d_nnz=PETSC_NULL for PETSc to control dynamic
1926    memory allocation.  Likewise, specify preallocated storage for the
1927    off-diagonal part of the local submatrix with o_nz or o_nnz (not both).
1928 
1929    Consider a processor that owns rows 3, 4 and 5 of a parallel matrix. In
1930    the figure below we depict these three local rows and all columns (0-11).
1931 
1932 .vb
1933            0 1 2 3 4 5 6 7 8 9 10 11
1934           -------------------
1935    row 3  |  o o o d d d o o o o o o
1936    row 4  |  o o o d d d o o o o o o
1937    row 5  |  o o o d d d o o o o o o
1938           -------------------
1939 .ve
1940 
1941    Thus, any entries in the d locations are stored in the d (diagonal)
1942    submatrix, and any entries in the o locations are stored in the
1943    o (off-diagonal) submatrix.  Note that the d submatrix is stored in
1944    the MATSEQSBAIJ format and the o submatrix in the MATSEQBAIJ format.
1945 
1946    Now d_nz should indicate the number of block nonzeros per row in the d matrix,
1947    and o_nz should indicate the number of block nonzeros per row in the o matrix.
1948    In general, for PDE problems in which most nonzeros are near the diagonal,
1949    one expects d_nz >> o_nz.   For large problems you MUST preallocate memory
1950    or you will get TERRIBLE performance; see the users' manual chapter on
1951    matrices.
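
   For example (a sketch only; the counts 5 and 2 are purely illustrative and
   must be replaced by values matching the actual nonzero pattern), a typical
   call on each process might be
.vb
   ierr = MatCreateMPISBAIJ(PETSC_COMM_WORLD,bs,m,m,M,M,5,PETSC_NULL,2,PETSC_NULL,&A);CHKERRQ(ierr);
.ve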
1952 
1953    Level: intermediate
1954 
1955 .keywords: matrix, block, aij, compressed row, sparse, parallel
1956 
1957 .seealso: MatCreate(), MatCreateSeqSBAIJ(), MatSetValues(), MatCreateMPIBAIJ()
1958 @*/
1959 
1960 int MatCreateMPISBAIJ(MPI_Comm comm,int bs,int m,int n,int M,int N,int d_nz,int *d_nnz,int o_nz,int *o_nnz,Mat *A)
1961 {
1962   int ierr,size;
1963 
1964   PetscFunctionBegin;
1965   ierr = MatCreate(comm,m,n,M,N,A);CHKERRQ(ierr);
1966   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1967   if (size > 1) {
1968     ierr = MatSetType(*A,MATMPISBAIJ);CHKERRQ(ierr);
1969     ierr = MatMPISBAIJSetPreallocation(*A,bs,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
1970   } else {
1971     ierr = MatSetType(*A,MATSEQSBAIJ);CHKERRQ(ierr);
1972     ierr = MatSeqSBAIJSetPreallocation(*A,bs,d_nz,d_nnz);CHKERRQ(ierr);
1973   }
1974   PetscFunctionReturn(0);
1975 }
1976 
1977 
1978 #undef __FUNCT__
1979 #define __FUNCT__ "MatDuplicate_MPISBAIJ"
1980 static int MatDuplicate_MPISBAIJ(Mat matin,MatDuplicateOption cpvalues,Mat *newmat)
1981 {
1982   Mat          mat;
1983   Mat_MPISBAIJ *a,*oldmat = (Mat_MPISBAIJ*)matin->data;
1984   int          ierr,len=0;
1985 
1986   PetscFunctionBegin;
1987   *newmat       = 0;
1988   ierr = MatCreate(matin->comm,matin->m,matin->n,matin->M,matin->N,&mat);CHKERRQ(ierr);
1989   ierr = MatSetType(mat,MATMPISBAIJ);CHKERRQ(ierr);
1990   mat->preallocated = PETSC_TRUE;
1991   a = (Mat_MPISBAIJ*)mat->data;
1992   a->bs  = oldmat->bs;
1993   a->bs2 = oldmat->bs2;
1994   a->mbs = oldmat->mbs;
1995   a->nbs = oldmat->nbs;
1996   a->Mbs = oldmat->Mbs;
1997   a->Nbs = oldmat->Nbs;
1998 
1999   a->rstart       = oldmat->rstart;
2000   a->rend         = oldmat->rend;
2001   a->cstart       = oldmat->cstart;
2002   a->cend         = oldmat->cend;
2003   a->size         = oldmat->size;
2004   a->rank         = oldmat->rank;
2005   a->donotstash   = oldmat->donotstash;
2006   a->roworiented  = oldmat->roworiented;
2007   a->rowindices   = 0;
2008   a->rowvalues    = 0;
2009   a->getrowactive = PETSC_FALSE;
2010   a->barray       = 0;
2011   a->rstart_bs    = oldmat->rstart_bs;
2012   a->rend_bs      = oldmat->rend_bs;
2013   a->cstart_bs    = oldmat->cstart_bs;
2014   a->cend_bs      = oldmat->cend_bs;
2015 
2016   /* hash table stuff */
2017   a->ht           = 0;
2018   a->hd           = 0;
2019   a->ht_size      = 0;
2020   a->ht_flag      = oldmat->ht_flag;
2021   a->ht_fact      = oldmat->ht_fact;
2022   a->ht_total_ct  = 0;
2023   a->ht_insert_ct = 0;
2024 
2025   ierr = PetscMalloc(3*(a->size+2)*sizeof(int),&a->rowners);CHKERRQ(ierr);
2026   PetscLogObjectMemory(mat,3*(a->size+2)*sizeof(int)+sizeof(struct _p_Mat)+sizeof(Mat_MPISBAIJ));
2027   a->cowners    = a->rowners + a->size + 2;
2028   a->rowners_bs = a->cowners + a->size + 2;
2029   ierr = PetscMemcpy(a->rowners,oldmat->rowners,3*(a->size+2)*sizeof(int));CHKERRQ(ierr);
2030   ierr = MatStashCreate_Private(matin->comm,1,&mat->stash);CHKERRQ(ierr);
2031   ierr = MatStashCreate_Private(matin->comm,oldmat->bs,&mat->bstash);CHKERRQ(ierr);
2032   if (oldmat->colmap) {
2033 #if defined (PETSC_USE_CTABLE)
2034     ierr = PetscTableCreateCopy(oldmat->colmap,&a->colmap);CHKERRQ(ierr);
2035 #else
2036     ierr = PetscMalloc((a->Nbs)*sizeof(int),&a->colmap);CHKERRQ(ierr);
2037     PetscLogObjectMemory(mat,(a->Nbs)*sizeof(int));
2038     ierr = PetscMemcpy(a->colmap,oldmat->colmap,(a->Nbs)*sizeof(int));CHKERRQ(ierr);
2039 #endif
2040   } else a->colmap = 0;
2041   if (oldmat->garray && (len = ((Mat_SeqBAIJ*)(oldmat->B->data))->nbs)) {
2042     ierr = PetscMalloc(len*sizeof(int),&a->garray);CHKERRQ(ierr);
2043     PetscLogObjectMemory(mat,len*sizeof(int));
2044     ierr = PetscMemcpy(a->garray,oldmat->garray,len*sizeof(int));CHKERRQ(ierr);
2045   } else a->garray = 0;
2046 
2047   ierr =  VecDuplicate(oldmat->lvec,&a->lvec);CHKERRQ(ierr);
2048   PetscLogObjectParent(mat,a->lvec);
2049   ierr =  VecScatterCopy(oldmat->Mvctx,&a->Mvctx);CHKERRQ(ierr);
2050 
2051   PetscLogObjectParent(mat,a->Mvctx);
2052   ierr =  MatDuplicate(oldmat->A,cpvalues,&a->A);CHKERRQ(ierr);
2053   PetscLogObjectParent(mat,a->A);
2054   ierr =  MatDuplicate(oldmat->B,cpvalues,&a->B);CHKERRQ(ierr);
2055   PetscLogObjectParent(mat,a->B);
2056   ierr = PetscFListDuplicate(matin->qlist,&mat->qlist);CHKERRQ(ierr); /* copy the composed-function list from the original matrix into the duplicate */
2057   *newmat = mat;
2058   PetscFunctionReturn(0);
2059 }
2060 
2061 #include "petscsys.h"
2062 
2063 EXTERN_C_BEGIN
2064 #undef __FUNCT__
2065 #define __FUNCT__ "MatLoad_MPISBAIJ"
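/*
   MatLoad_MPISBAIJ - a sketch of the loading protocol implemented below:
   process 0 reads the binary header, row lengths, column indices and values,
   pads the matrix with extra unit diagonal rows if the global size is not
   divisible by the block size, and ships each process its slice of the row
   lengths (MPI_Scatterv), column indices and values (MPI_Send/MPI_Recv).
   Each process then counts its diagonal/off-diagonal block nonzeros, creates
   the matrix with MatCreateMPISBAIJ(), and inserts its local rows.
*/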
2066 int MatLoad_MPISBAIJ(PetscViewer viewer,MatType type,Mat *newmat)
2067 {
2068   Mat          A;
2069   int          i,nz,ierr,j,rstart,rend,fd;
2070   PetscScalar  *vals,*buf;
2071   MPI_Comm     comm = ((PetscObject)viewer)->comm;
2072   MPI_Status   status;
2073   int          header[4],rank,size,*rowlengths = 0,M,N,m,*rowners,*browners,maxnz,*cols;
2074   int          *locrowlens,*sndcounts = 0,*procsnz = 0,jj,*mycols,*ibuf;
2075   int          tag = ((PetscObject)viewer)->tag,bs=1,Mbs,mbs,extra_rows;
2076   int          *dlens,*odlens,*mask,*masked1,*masked2,rowcount,odcount;
2077   int          dcount,kmax,k,nzcount,tmp;
2078 
2079   PetscFunctionBegin;
2080   ierr = PetscOptionsGetInt(PETSC_NULL,"-matload_block_size",&bs,PETSC_NULL);CHKERRQ(ierr);
2081 
2082   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2083   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
2084   if (!rank) {
2085     ierr = PetscViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
2086     ierr = PetscBinaryRead(fd,(char *)header,4,PETSC_INT);CHKERRQ(ierr);
2087     if (header[0] != MAT_FILE_COOKIE) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"not matrix object");
2088     if (header[3] < 0) {
2089       SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"Matrix stored in special format, cannot load as MPISBAIJ");
2090     }
2091   }
2092 
2093   ierr = MPI_Bcast(header+1,3,MPI_INT,0,comm);CHKERRQ(ierr);
2094   M = header[1]; N = header[2];
2095 
2096   if (M != N) SETERRQ(PETSC_ERR_SUP,"Can only do square matrices");
2097 
2098   /*
2099      This code adds extra rows to make sure the number of rows is
2100      divisible by the blocksize
2101   */
2102   Mbs        = M/bs;
2103   extra_rows = bs - M + bs*(Mbs);
2104   if (extra_rows == bs) extra_rows = 0;
2105   else                  Mbs++;
2106   if (extra_rows && !rank) {
2107     PetscLogInfo(0,"MatLoad_MPISBAIJ:Padding loaded matrix to match blocksize\n");
2108   }
2109 
2110   /* determine ownership of all rows */
2111   mbs        = Mbs/size + ((Mbs % size) > rank);
2112   m          = mbs*bs;
2113   ierr       = PetscMalloc(2*(size+2)*sizeof(int),&rowners);CHKERRQ(ierr);
2114   browners   = rowners + size + 1;
2115   ierr       = MPI_Allgather(&mbs,1,MPI_INT,rowners+1,1,MPI_INT,comm);CHKERRQ(ierr);
2116   rowners[0] = 0;
2117   for (i=2; i<=size; i++) rowners[i] += rowners[i-1];
2118   for (i=0; i<=size;  i++) browners[i] = rowners[i]*bs;
2119   rstart = rowners[rank];
2120   rend   = rowners[rank+1];
2121 
2122   /* distribute row lengths to all processors */
2123   ierr = PetscMalloc((rend-rstart)*bs*sizeof(int),&locrowlens);CHKERRQ(ierr);
2124   if (!rank) {
2125     ierr = PetscMalloc((M+extra_rows)*sizeof(int),&rowlengths);CHKERRQ(ierr);
2126     ierr = PetscBinaryRead(fd,rowlengths,M,PETSC_INT);CHKERRQ(ierr);
2127     for (i=0; i<extra_rows; i++) rowlengths[M+i] = 1;
2128     ierr = PetscMalloc(size*sizeof(int),&sndcounts);CHKERRQ(ierr);
2129     for (i=0; i<size; i++) sndcounts[i] = browners[i+1] - browners[i];
2130     ierr = MPI_Scatterv(rowlengths,sndcounts,browners,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2131     ierr = PetscFree(sndcounts);CHKERRQ(ierr);
2132   } else {
2133     ierr = MPI_Scatterv(0,0,0,MPI_INT,locrowlens,(rend-rstart)*bs,MPI_INT,0,comm);CHKERRQ(ierr);
2134   }
2135 
2136   if (!rank) {   /* procs[0] */
2137     /* calculate the number of nonzeros on each processor */
2138     ierr = PetscMalloc(size*sizeof(int),&procsnz);CHKERRQ(ierr);
2139     ierr = PetscMemzero(procsnz,size*sizeof(int));CHKERRQ(ierr);
2140     for (i=0; i<size; i++) {
2141       for (j=rowners[i]*bs; j< rowners[i+1]*bs; j++) {
2142         procsnz[i] += rowlengths[j];
2143       }
2144     }
2145     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
2146 
2147     /* determine max buffer needed and allocate it */
2148     maxnz = 0;
2149     for (i=0; i<size; i++) {
2150       maxnz = PetscMax(maxnz,procsnz[i]);
2151     }
2152     ierr = PetscMalloc(maxnz*sizeof(int),&cols);CHKERRQ(ierr);
2153 
2154     /* read in my part of the matrix column indices  */
2155     nz     = procsnz[0];
2156     ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
2157     mycols = ibuf;
2158     if (size == 1)  nz -= extra_rows;
2159     ierr = PetscBinaryRead(fd,mycols,nz,PETSC_INT);CHKERRQ(ierr);
2160     if (size == 1)  for (i=0; i< extra_rows; i++) { mycols[nz+i] = M+i; }
2161 
2162     /* read in everyone else's parts (except the last proc's) and ship them off */
2163     for (i=1; i<size-1; i++) {
2164       nz   = procsnz[i];
2165       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2166       ierr = MPI_Send(cols,nz,MPI_INT,i,tag,comm);CHKERRQ(ierr);
2167     }
2168     /* read in the stuff for the last proc */
2169     if (size != 1) {
2170       nz   = procsnz[size-1] - extra_rows;  /* the extra rows are not on the disk */
2171       ierr = PetscBinaryRead(fd,cols,nz,PETSC_INT);CHKERRQ(ierr);
2172       for (i=0; i<extra_rows; i++) cols[nz+i] = M+i;
2173       ierr = MPI_Send(cols,nz+extra_rows,MPI_INT,size-1,tag,comm);CHKERRQ(ierr);
2174     }
2175     ierr = PetscFree(cols);CHKERRQ(ierr);
2176   } else {  /* procs[i], i>0 */
2177     /* determine buffer space needed for message */
2178     nz = 0;
2179     for (i=0; i<m; i++) {
2180       nz += locrowlens[i];
2181     }
2182     ierr   = PetscMalloc(nz*sizeof(int),&ibuf);CHKERRQ(ierr);
2183     mycols = ibuf;
2184     /* receive message of column indices*/
2185     ierr = MPI_Recv(mycols,nz,MPI_INT,0,tag,comm,&status);CHKERRQ(ierr);
2186     ierr = MPI_Get_count(&status,MPI_INT,&maxnz);CHKERRQ(ierr);
2187     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with the file");
2188   }
2189 
2190   /* loop over local rows, determining number of off diagonal entries */
2191   ierr     = PetscMalloc(2*(rend-rstart+1)*sizeof(int),&dlens);CHKERRQ(ierr);
2192   odlens   = dlens + (rend-rstart);
2193   ierr     = PetscMalloc(3*Mbs*sizeof(int),&mask);CHKERRQ(ierr);
2194   ierr     = PetscMemzero(mask,3*Mbs*sizeof(int));CHKERRQ(ierr);
2195   masked1  = mask    + Mbs;
2196   masked2  = masked1 + Mbs;
2197   rowcount = 0; nzcount = 0;
2198   for (i=0; i<mbs; i++) {
2199     dcount  = 0;
2200     odcount = 0;
2201     for (j=0; j<bs; j++) {
2202       kmax = locrowlens[rowcount];
2203       for (k=0; k<kmax; k++) {
2204         tmp = mycols[nzcount++]/bs; /* block col. index */
2205         if (!mask[tmp]) {
2206           mask[tmp] = 1;
2207           if (tmp < rstart || tmp >= rend) masked2[odcount++] = tmp; /* entry in off-diag portion */
2208           else masked1[dcount++] = tmp; /* entry in diag portion */
2209         }
2210       }
2211       rowcount++;
2212     }
2213 
2214     dlens[i]  = dcount;  /* d_nzz[i] */
2215     odlens[i] = odcount; /* o_nzz[i] */
2216 
2217     /* zero out the mask elements we set */
2218     for (j=0; j<dcount; j++) mask[masked1[j]] = 0;
2219     for (j=0; j<odcount; j++) mask[masked2[j]] = 0;
2220   }
2221 
2222   /* create our matrix */
2223   ierr = MatCreateMPISBAIJ(comm,bs,m,m,PETSC_DETERMINE,PETSC_DETERMINE,0,dlens,0,odlens,newmat);
2224   CHKERRQ(ierr);
2225   A = *newmat;
2226   ierr = MatSetOption(A,MAT_COLUMNS_SORTED);CHKERRQ(ierr);
2227 
2228   if (!rank) {
2229     ierr = PetscMalloc(maxnz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
2230     /* read in my part of the matrix numerical values  */
2231     nz = procsnz[0];
2232     vals = buf;
2233     mycols = ibuf;
2234     if (size == 1)  nz -= extra_rows;
2235     ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2236     if (size == 1)  for (i=0; i< extra_rows; i++) { vals[nz+i] = 1.0; }
2237 
2238     /* insert into matrix */
2239     jj      = rstart*bs;
2240     for (i=0; i<m; i++) {
2241       ierr = MatSetValues(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2242       mycols += locrowlens[i];
2243       vals   += locrowlens[i];
2244       jj++;
2245     }
2246 
2247     /* read in other processors (except the last one) and ship out */
2248     for (i=1; i<size-1; i++) {
2249       nz   = procsnz[i];
2250       vals = buf;
2251       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2252       ierr = MPI_Send(vals,nz,MPIU_SCALAR,i,A->tag,comm);CHKERRQ(ierr);
2253     }
2254     /* the last proc */
2255     if (size != 1){
2256       nz   = procsnz[i] - extra_rows;
2257       vals = buf;
2258       ierr = PetscBinaryRead(fd,vals,nz,PETSC_SCALAR);CHKERRQ(ierr);
2259       for (i=0; i<extra_rows; i++) vals[nz+i] = 1.0;
2260       ierr = MPI_Send(vals,nz+extra_rows,MPIU_SCALAR,size-1,A->tag,comm);CHKERRQ(ierr);
2261     }
2262     ierr = PetscFree(procsnz);CHKERRQ(ierr);
2263 
2264   } else {
2265     /* receive numeric values */
2266     ierr = PetscMalloc(nz*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
2267 
2268     /* receive message of values*/
2269     vals   = buf;
2270     mycols = ibuf;
2271     ierr   = MPI_Recv(vals,nz,MPIU_SCALAR,0,A->tag,comm,&status);CHKERRQ(ierr);
2272     ierr   = MPI_Get_count(&status,MPIU_SCALAR,&maxnz);CHKERRQ(ierr);
2273     if (maxnz != nz) SETERRQ(PETSC_ERR_FILE_UNEXPECTED,"something is wrong with the file");
2274 
2275     /* insert into matrix */
2276     jj      = rstart*bs;
2277     for (i=0; i<m; i++) {
2278       ierr    = MatSetValues_MPISBAIJ(A,1,&jj,locrowlens[i],mycols,vals,INSERT_VALUES);CHKERRQ(ierr);
2279       mycols += locrowlens[i];
2280       vals   += locrowlens[i];
2281       jj++;
2282     }
2283   }
2284 
2285   ierr = PetscFree(locrowlens);CHKERRQ(ierr);
2286   ierr = PetscFree(buf);CHKERRQ(ierr);
2287   ierr = PetscFree(ibuf);CHKERRQ(ierr);
2288   ierr = PetscFree(rowners);CHKERRQ(ierr);
2289   ierr = PetscFree(dlens);CHKERRQ(ierr);
2290   ierr = PetscFree(mask);CHKERRQ(ierr);
2291   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2292   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2293   PetscFunctionReturn(0);
2294 }
2295 EXTERN_C_END
2296 
2297 #undef __FUNCT__
2298 #define __FUNCT__ "MatMPISBAIJSetHashTableFactor"
2299 /*@
2300    MatMPISBAIJSetHashTableFactor - Sets the factor required to compute the size of the HashTable.
2301 
2302    Input Parameters:
2303 .  mat  - the matrix
2304 .  fact - factor
2305 
2306    Collective on Mat
2307 
2308    Level: advanced
2309 
2310   Notes:
2311    This can also be set by the command line option: -mat_use_hash_table fact
2312 
2313 .keywords: matrix, hashtable, factor, HT
2314 
2315 .seealso: MatSetOption()
2316 @*/
2317 int MatMPISBAIJSetHashTableFactor(Mat mat,PetscReal fact)
2318 {
2319   PetscFunctionBegin;
2320   SETERRQ(1,"Function not yet written for SBAIJ format");
2321   /* PetscFunctionReturn(0); */
2322 }
2323 
2324 #undef __FUNCT__
2325 #define __FUNCT__ "MatGetRowMax_MPISBAIJ"
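/*
   MatGetRowMax_MPISBAIJ - a sketch of the scheme used below: the row maxima
   of the local diagonal block are obtained from MatGetRowMax() on a->A.
   Because only one triangular half of the matrix is stored, entries of the
   local off-diagonal block B also contribute to rows owned by higher-ranked
   processes; each process accumulates those contributions in a work array
   and sends each slice to its owner, which folds it into the result vector.
*/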
2326 int MatGetRowMax_MPISBAIJ(Mat A,Vec v)
2327 {
2328   Mat_MPISBAIJ *a = (Mat_MPISBAIJ*)A->data;
2329   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ*)(a->B)->data;
2330   PetscReal    atmp;
2331   PetscReal    *work,*svalues,*rvalues;
2332   int          ierr,i,bs,mbs,*bi,*bj,brow,j,ncols,krow,kcol,col,row,Mbs,bcol;
2333   int          rank,size,*rowners_bs,dest,count,source;
2334   PetscScalar  *va;
2335   MatScalar    *ba;
2336   MPI_Status   stat;
2337 
2338   PetscFunctionBegin;
2339   ierr = MatGetRowMax(a->A,v);CHKERRQ(ierr);
2340   ierr = VecGetArray(v,&va);CHKERRQ(ierr);
2341 
2342   ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
2343   ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
2344 
2345   bs   = a->bs;
2346   mbs  = a->mbs;
2347   Mbs  = a->Mbs;
2348   ba   = b->a;
2349   bi   = b->i;
2350   bj   = b->j;
2351   /*
2352   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] M: %d, bs: %d, mbs: %d \n",rank,bs*Mbs,bs,mbs);
2353   PetscSynchronizedFlush(PETSC_COMM_WORLD);
2354   */
2355 
2356   /* find ownerships */
2357   rowners_bs = a->rowners_bs;
2358   /*
2359   if (!rank){
2360     for (i=0; i<size+1; i++) PetscPrintf(PETSC_COMM_SELF," rowners_bs[%d]: %d\n",i,rowners_bs[i]);
2361   }
2362   */
2363 
2364   /* each proc creates an array to be distributed */
2365   ierr = PetscMalloc(bs*Mbs*sizeof(PetscReal),&work);CHKERRQ(ierr);
2366   ierr = PetscMemzero(work,bs*Mbs*sizeof(PetscReal));CHKERRQ(ierr);
2367 
2368   /* row_max for B */
2369   if (rank != size-1){
2370     for (i=0; i<mbs; i++) {
2371       ncols = bi[1] - bi[0]; bi++;
2372       brow  = bs*i;
2373       for (j=0; j<ncols; j++){
2374         bcol = bs*(*bj);
2375         for (kcol=0; kcol<bs; kcol++){
2376           col = bcol + kcol;                 /* local col index */
2377           col += rowners_bs[rank+1];      /* global col index */
2378           /* PetscPrintf(PETSC_COMM_SELF,"[%d], col: %d\n",rank,col); */
2379           for (krow=0; krow<bs; krow++){
2380             atmp = PetscAbsScalar(*ba); ba++;
2381             row = brow + krow;    /* local row index */
2382             /* printf("val[%d,%d]: %g\n",row,col,atmp); */
2383             if (PetscRealPart(va[row]) < atmp) va[row] = atmp;
2384             if (work[col] < atmp) work[col] = atmp;
2385           }
2386         }
2387         bj++;
2388       }
2389     }
2390     /*
2391       PetscPrintf(PETSC_COMM_SELF,"[%d], work: ",rank);
2392       for (i=0; i<bs*Mbs; i++) PetscPrintf(PETSC_COMM_SELF,"%g ",work[i]);
2393       PetscPrintf(PETSC_COMM_SELF,"[%d]: \n");
2394       */
2395 
2396     /* send values to its owners */
2397     for (dest=rank+1; dest<size; dest++){
2398       svalues = work + rowners_bs[dest];
2399       count   = rowners_bs[dest+1]-rowners_bs[dest];
2400       ierr    = MPI_Send(svalues,count,MPIU_REAL,dest,rank,PETSC_COMM_WORLD);CHKERRQ(ierr);
2401       /*
2402       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] sends %d values to [%d]: %g, %g, %g, %g\n",rank,count,dest,svalues[0],svalues[1],svalues[2],svalues[3]);
2403       PetscSynchronizedFlush(PETSC_COMM_WORLD);
2404       */
2405     }
2406   }
2407 
2408   /* receive values */
2409   if (rank){
2410     rvalues = work;
2411     count   = rowners_bs[rank+1]-rowners_bs[rank];
2412     for (source=0; source<rank; source++){
2413       ierr = MPI_Recv(rvalues,count,MPIU_REAL,MPI_ANY_SOURCE,MPI_ANY_TAG,PETSC_COMM_WORLD,&stat);CHKERRQ(ierr);
2414       /* process values */
2415       for (i=0; i<count; i++){
2416         if (PetscRealPart(va[i]) < rvalues[i]) va[i] = rvalues[i];
2417       }
2418       /*
2419       PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] received %d values from [%d]: %g, %g, %g, %g \n",rank,count,stat.MPI_SOURCE,rvalues[0],rvalues[1],rvalues[2],rvalues[3]);
2420       PetscSynchronizedFlush(PETSC_COMM_WORLD);
2421       */
2422     }
2423   }
2424 
2425   ierr = VecRestoreArray(v,&va);CHKERRQ(ierr);
2426   ierr = PetscFree(work);CHKERRQ(ierr);
2427   PetscFunctionReturn(0);
2428 }
2429 
2430 #undef __FUNCT__
2431 #define __FUNCT__ "MatRelax_MPISBAIJ"
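/*
   MatRelax_MPISBAIJ - SSOR sweep for the symmetric storage format.  A sketch
   of the scheme used below: since only one triangular half is stored, both
   the lower triangular contribution (B^T times x) and the upper triangular
   contribution (B times x) are needed.  They are combined into a single
   forward scatter by packing -x and -B^T x into the extended work vector
   slvec0 and accumulating into slvec1, which avoids the separate forward and
   reverse scatters used by MatRelax_MPISBAIJ_2comm() below.
*/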
2432 int MatRelax_MPISBAIJ(Mat matin,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,int its,int lits,Vec xx)
2433 {
2434   Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
2435   int            ierr,mbs=mat->mbs,bs=mat->bs;
2436   PetscScalar    mone=-1.0,*x,*b,*ptr,zero=0.0;
2437   Vec            bb1;
2438 
2439   PetscFunctionBegin;
2440   if (its <= 0 || lits <= 0) SETERRQ2(PETSC_ERR_ARG_WRONG,"Relaxation requires global its %d and local its %d both positive",its,lits);
2441   if (bs > 1)
2442     SETERRQ(PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented");
2443 
2444   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
2445     if ( flag & SOR_ZERO_INITIAL_GUESS ) {
2446       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,lits,lits,xx);CHKERRQ(ierr);
2447       its--;
2448     }
2449 
2450     ierr = VecDuplicate(bb,&bb1);CHKERRQ(ierr);
2451     while (its--){
2452 
2453       /* lower triangular part: slvec0b = - B^T*xx */
2454       ierr = (*mat->B->ops->multtranspose)(mat->B,xx,mat->slvec0b);CHKERRQ(ierr);
2455 
2456       /* copy xx into slvec0a */
2457       ierr = VecGetArray(mat->slvec0,&ptr);CHKERRQ(ierr);
2458       ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2459       ierr = PetscMemcpy(ptr,x,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2460       ierr = VecRestoreArray(mat->slvec0,&ptr);CHKERRQ(ierr);
2461 
2462       ierr = VecScale(&mone,mat->slvec0);CHKERRQ(ierr);
2463 
2464       /* copy bb into slvec1a */
2465       ierr = VecGetArray(mat->slvec1,&ptr);CHKERRQ(ierr);
2466       ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2467       ierr = PetscMemcpy(ptr,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2468       ierr = VecRestoreArray(mat->slvec1,&ptr);CHKERRQ(ierr);
2469 
2470       /* set slvec1b = 0 */
2471       ierr = VecSet(&zero,mat->slvec1b);CHKERRQ(ierr);
2472 
2473       ierr = VecScatterBegin(mat->slvec0,mat->slvec1,ADD_VALUES,SCATTER_FORWARD,mat->sMvctx);CHKERRQ(ierr);
2474       ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2475       ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2476       ierr = VecScatterEnd(mat->slvec0,mat->slvec1,ADD_VALUES,SCATTER_FORWARD,mat->sMvctx);CHKERRQ(ierr);
2477 
2478       /* upper triangular part: bb1 = bb1 - B*x */
2479       ierr = (*mat->B->ops->multadd)(mat->B,mat->slvec1b,mat->slvec1a,bb1);CHKERRQ(ierr);
2480 
2481       /* local diagonal sweep */
2482       ierr = (*mat->A->ops->relax)(mat->A,bb1,omega,SOR_SYMMETRIC_SWEEP,fshift,lits,lits,xx);CHKERRQ(ierr);
2483     }
2484     ierr = VecDestroy(bb1);CHKERRQ(ierr);
2485   } else {
2486     SETERRQ(PETSC_ERR_SUP,"MatSORType is not supported for SBAIJ matrix format");
2487   }
2488   PetscFunctionReturn(0);
2489 }
2490 
2491 #undef __FUNCT__
2492 #define __FUNCT__ "MatRelax_MPISBAIJ_2comm"
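/*
   MatRelax_MPISBAIJ_2comm - an alternative implementation of the SSOR sweep
   that uses two communication phases: a forward scatter of x to obtain the
   ghost values needed for the B times x term, and a reverse scatter to
   return the B^T times x contributions to their owning processes.
*/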
2493 int MatRelax_MPISBAIJ_2comm(Mat matin,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,int its,int lits,Vec xx)
2494 {
2495   Mat_MPISBAIJ   *mat = (Mat_MPISBAIJ*)matin->data;
2496   int            ierr;
2497   PetscScalar    mone=-1.0;
2498   Vec            lvec1,bb1;
2499 
2500   PetscFunctionBegin;
2501   if (its <= 0 || lits <= 0) SETERRQ2(PETSC_ERR_ARG_WRONG,"Relaxation requires global its %d and local its %d both positive",its,lits);
2502   if (mat->bs > 1)
2503     SETERRQ(PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented");
2504 
2505   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP){
2506     if ( flag & SOR_ZERO_INITIAL_GUESS ) {
2507       ierr = (*mat->A->ops->relax)(mat->A,bb,omega,flag,fshift,lits,lits,xx);CHKERRQ(ierr);
2508       its--;
2509     }
2510 
2511     ierr = VecDuplicate(mat->lvec,&lvec1);CHKERRQ(ierr);
2512     ierr = VecDuplicate(bb,&bb1);CHKERRQ(ierr);
2513     while (its--){
2514       ierr = VecScatterBegin(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
2515 
2516       /* lower diagonal part: bb1 = bb - B^T*xx */
2517       ierr = (*mat->B->ops->multtranspose)(mat->B,xx,lvec1);CHKERRQ(ierr);
2518       ierr = VecScale(&mone,lvec1);CHKERRQ(ierr);
2519 
2520       ierr = VecScatterEnd(xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD,mat->Mvctx);CHKERRQ(ierr);
2521       ierr = VecCopy(bb,bb1);CHKERRQ(ierr);
2522       ierr = VecScatterBegin(lvec1,bb1,ADD_VALUES,SCATTER_REVERSE,mat->Mvctx);CHKERRQ(ierr);
2523 
2524       /* upper diagonal part: bb1 = bb1 - B*x */
2525       ierr = VecScale(&mone,mat->lvec);CHKERRQ(ierr);
2526       ierr = (*mat->B->ops->multadd)(mat->B,mat->lvec,bb1,bb1);CHKERRQ(ierr);
2527 
2528       ierr = VecScatterEnd(lvec1,bb1,ADD_VALUES,SCATTER_REVERSE,mat->Mvctx);CHKERRQ(ierr);
2529 
2530       /* diagonal sweep */
2531       ierr = (*mat->A->ops->relax)(mat->A,bb1,omega,SOR_SYMMETRIC_SWEEP,fshift,lits,lits,xx);CHKERRQ(ierr);
2532     }
2533     ierr = VecDestroy(lvec1);CHKERRQ(ierr);
2534     ierr = VecDestroy(bb1);CHKERRQ(ierr);
2535   } else {
2536     SETERRQ(PETSC_ERR_SUP,"MatSORType is not supported for SBAIJ matrix format");
2537   }
2538   PetscFunctionReturn(0);
2539 }
2540 
2541