xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision abe6c858dfcdc3f456cce7b6c5613444c2586b44)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the imax parallel index
   sets is[] by ov levels, by applying one level of overlap ov times.

   Input:
     C    - the MPIAIJ matrix
     imax - number of index sets
     is   - the index sets; each call to _Once destroys is[i] and replaces it
            with the enlarged set (see ISDestroy/ISCreateGeneral in _Once)
     ov   - number of overlap levels to add; must be non-negative
*/
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
51   nrqr - no of requests received (which have to be, or which have been, processed)
52 */
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs a single round of overlap increase
   on is[0..imax-1].  Row indices owned by other processes are shipped to their
   owners (message format described above); each process expands the rows it
   owns by one level (MatIncreaseOverlap_MPIAIJ_Local for local requests,
   MatIncreaseOverlap_MPIAIJ_Receive for remote ones) and the enlarged lists
   are sent back.  On exit each is[i] has been destroyed and replaced by a new
   IS on PETSC_COMM_SELF holding the enlarged (unsorted) set.

   Per-IS bookkeeping: table[i] is a bit table over all M global rows marking
   membership, data[i] is the membership list, isz[i] its current length.
*/
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
56 {
57   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
58   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
59   const PetscInt **idx,*idx_i;
60   PetscInt       *n,**data,len;
61   PetscErrorCode ierr;
62   PetscMPIInt    size,rank,tag1,tag2;
63   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
64   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
65   PetscBT        *table;
66   MPI_Comm       comm;
67   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
68   MPI_Status     *s_status,*recv_status;
69   char           *t_p;
70 
71   PetscFunctionBegin;
72   comm   = ((PetscObject)C)->comm;
73   size   = c->size;
74   rank   = c->rank;
75   M      = C->rmap->N;
76 
77   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
78   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
79 
80   ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);
81 
82   for (i=0; i<imax; i++) {
83     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
84     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
85   }
86 
87   /* evaluate communication - mesg to who,length of mesg, and buffer space
88      required. Based on this, buffers are allocated, and data copied into them.
89      w1[p] = total indices destined for proc p, w3[p] = number of ISs with at
90      least one index for p, w2[p] = 1 iff a message goes to p, w4 = scratch. */
89   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
90   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
91   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
92   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
93   for (i=0; i<imax; i++) {
94     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
95     idx_i = idx[i];
96     len   = n[i];
97     for (j=0; j<len; j++) {
98       row  = idx_i[j];
99       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
100       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
101       w4[proc]++;
102     }
103     for (j=0; j<size; j++){
104       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
105     }
106   }
107 
108   nrqs     = 0;              /* no of outgoing messages */
109   msz      = 0;              /* total mesg length (for all procs) */
110   w1[rank] = 0;              /* no mesg sent to itself */
111   w3[rank] = 0;
112   for (i=0; i<size; i++) {
113     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
114   }
115   /* pa - is list of processors to communicate with */
116   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
117   for (i=0,j=0; i<size; i++) {
118     if (w1[i]) {pa[j] = i; j++;}
119   }
120 
121   /* Each message would have a header = 1 + 2*(no of IS) + data */
122   for (i=0; i<nrqs; i++) {
123     j      = pa[i];
124     w1[j] += w2[j] + 2*w3[j];
125     msz   += w1[j];
126   }
127 
128   /* Determine the number of messages to expect, their lengths, and from-ids */
129   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
130   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
131 
132   /* Now post the Irecvs corresponding to these messages */
133   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
134 
135   /* Allocate Memory for outgoing messages */
136   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
137   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
138   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
139 
  /* Carve the single buffer tmp into per-destination slices: outdat[pa[i]]
     points to a slice of length w1[pa[i]] */
140   {
141     PetscInt *iptr = tmp,ict  = 0;
142     for (i=0; i<nrqs; i++) {
143       j         = pa[i];
144       iptr     +=  ict;
145       outdat[j] = iptr;
146       ict       = w1[j];
147     }
148   }
149 
150   /* Form the outgoing messages */
151   /*plug in the headers; ptr[j] tracks the next free data slot past the header */
152   for (i=0; i<nrqs; i++) {
153     j            = pa[i];
154     outdat[j][0] = 0;
155     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
156     ptr[j]       = outdat[j] + 2*w3[j] + 1;
157   }
158 
159   /* Memory for doing local proc's work: one bit table (t_p slice) and one
160      index list (d_p slice) of capacity M per IS */
160   {
161     ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
162                         M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
163     ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
164     ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
165     ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
166     ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
167     ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);
168 
169     for (i=0; i<imax; i++) {
170       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
171       data[i]  = d_p + M*i;
172     }
173   }
174 
175   /* Parse the IS and update local tables and the outgoing buf with the data*/
176   {
177     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
178     PetscBT  table_i;
179 
180     for (i=0; i<imax; i++) {
181       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
182       n_i     = n[i];
183       table_i = table[i];
184       idx_i   = idx[i];
185       data_i  = data[i];
186       isz_i   = isz[i];
187       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
188         row  = idx_i[j];
189         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
190         if (proc != rank) { /* copy to the outgoing buffer */
191           ctr[proc]++;
192           *ptr[proc] = row;
193           ptr[proc]++;
194         } else { /* Update the local table */
195           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
196         }
197       }
198       /* Update the headers for the current IS: append (is-number, count) pair */
199       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
200         if ((ctr_j = ctr[j])) {
201           outdat_j        = outdat[j];
202           k               = ++outdat_j[0];
203           outdat_j[2*k]   = ctr_j;
204           outdat_j[2*k-1] = i;
205         }
206       }
207       isz[i] = isz_i;
208     }
209   }
210 
211   /*  Now  post the sends */
212   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
213   for (i=0; i<nrqs; ++i) {
214     j    = pa[i];
215     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
216   }
217 
218   /* No longer need the original indices*/
219   for (i=0; i<imax; ++i) {
220     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
221   }
222   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
223 
  /* destroy the input ISs; they are recreated (enlarged) at the end */
224   for (i=0; i<imax; ++i) {
225     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
226   }
227 
228   /* Do Local work while communication is in flight */
229   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
230 
231   /* Receive messages*/
232   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
233   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
234 
235   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
236   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
237 
238   /* Phase 1 sends are complete - deallocate buffers */
239   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
240   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
241 
  /* process the received requests; xdata[i]/isz1[i] hold the reply to message i */
242   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
243   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
244   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
245   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
246   ierr = PetscFree(rbuf);CHKERRQ(ierr);
247 
248 
249  /* Send the data back*/
250   /* Do a global reduction to know the buffer space req for incoming messages*/
251   {
252     PetscMPIInt *rw1;
253 
254     ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
255     ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
256 
257     for (i=0; i<nrqr; ++i) {
258       proc      = recv_status[i].MPI_SOURCE;
259       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
260       rw1[proc] = isz1[i];
261     }
262     ierr = PetscFree(onodes1);CHKERRQ(ierr);
263     ierr = PetscFree(olengths1);CHKERRQ(ierr);
264 
265     /* Determine the number of messages to expect, their lengths, and from-ids */
266     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
267     ierr = PetscFree(rw1);CHKERRQ(ierr);
268   }
269   /* Now post the Irecvs corresponding to these messages */
270   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
271 
272   /*  Now  post the sends */
273   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
274   for (i=0; i<nrqr; ++i) {
275     j    = recv_status[i].MPI_SOURCE;
276     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
277   }
278 
279   /* receive work done on other processors and merge it into table/data/isz */
280   {
281     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
282     PetscMPIInt idex;
283     PetscBT     table_i;
284     MPI_Status  *status2;
285 
286     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
287     for (i=0; i<nrqs; ++i) {
288       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
289       /* Process the message*/
290       rbuf2_i = rbuf2[idex];
291       ct1     = 2*rbuf2_i[0]+1;
292       jmax    = rbuf2[idex][0];
293       for (j=1; j<=jmax; j++) {
294         max     = rbuf2_i[2*j];
295         is_no   = rbuf2_i[2*j-1];
296         isz_i   = isz[is_no];
297         data_i  = data[is_no];
298         table_i = table[is_no];
299         for (k=0; k<max; k++,ct1++) {
300           row = rbuf2_i[ct1];
301           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
302         }
303         isz[is_no] = isz_i;
304       }
305     }
306 
307     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
308     ierr = PetscFree(status2);CHKERRQ(ierr);
309   }
310 
  /* replace the (destroyed) input ISs with the enlarged sets */
311   for (i=0; i<imax; ++i) {
312     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
313   }
314 
315   ierr = PetscFree(onodes2);CHKERRQ(ierr);
316   ierr = PetscFree(olengths2);CHKERRQ(ierr);
317 
318   ierr = PetscFree(pa);CHKERRQ(ierr);
319   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
320   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
321   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
322   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
323   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
324   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
325   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
326   ierr = PetscFree(s_status);CHKERRQ(ierr);
327   ierr = PetscFree(recv_status);CHKERRQ(ierr);
328   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
329   ierr = PetscFree(xdata);CHKERRQ(ierr);
330   ierr = PetscFree(isz1);CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
  /* For every global row already in data[i], append every column it touches
     that is not yet marked in table[i].  New entries are appended past the
     current end (isz_i), so this single pass expands only the rows that were
     present on entry (j runs to the entry value of isz[i]). */
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;    /* convert to local row number */
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat: local columns, shift by cstart to get global */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat: garray maps compressed column to global */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
395 /*
396       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
397          and return the output
398 
399          Input:
400            C    - the matrix
401            nrqr - no of messages being processed.
402            rbuf - an array of pointers to the received requests
403 
404          Output:
405            xdata - array of messages to be sent back
406            isz1  - size of each message
407 
408   For better efficiency perhaps we should malloc separately each xdata[i],
409 then if a realloc is required we need only copy the data for that one row
410 rather than all previous rows, as it is now where a single large chunk of
411 memory is used.
412 
413 */
414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
415 {
416   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
417   Mat            A = c->A,B = c->B;
418   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
419   PetscErrorCode ierr;
420   PetscMPIInt    rank;
421   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
422   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
423   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
424   PetscInt       *rbuf_i,kmax,rbuf_0;
425   PetscBT        xtable;
426 
427   PetscFunctionBegin;
428   rank   = c->rank;
429   m      = C->rmap->N;
430   rstart = C->rmap->rstart;
431   cstart = C->cmap->rstart;
432   ai     = a->i;
433   aj     = a->j;
434   bi     = b->i;
435   bj     = b->j;
436   garray = c->garray;
437 
438 
  /* First pass over the requests: ct = total number of ISs referenced,
     total_sz = total number of requested row indices, used to size xdata[0] */
439   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
440     rbuf_i  =  rbuf[i];
441     rbuf_0  =  rbuf_i[0];
442     ct     += rbuf_0;
443     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
444   }
445 
  /* Estimate reply size from the average number of nonzeros per local row;
     the buffer grows by 1.5x (below) whenever the estimate proves too small */
446   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
447   else      max1 = 1;
448   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
449   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
450   ++no_malloc;
451   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
452   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
453 
454   ct3 = 0;
455   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
456     rbuf_i =  rbuf[i];
457     rbuf_0 =  rbuf_i[0];
458     ct1    =  2*rbuf_0+1;
459     ct2    =  ct1;
460     ct3    += ct1;
461     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
462       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
463       oct2 = ct2;
464       kmax = rbuf_i[2*j];
      /* copy the requested rows themselves into the reply (deduplicated) */
465       for (k=0; k<kmax; k++,ct1++) {
466         row = rbuf_i[ct1];
467         if (!PetscBTLookupSet(xtable,row)) {
468           if (!(ct3 < mem_estimate)) {
            /* grow the single shared buffer and re-base all earlier xdata[] slices */
469             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
470             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
471             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
472             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
473             xdata[0]     = tmp;
474             mem_estimate = new_estimate; ++no_malloc;
475             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
476           }
477           xdata[i][ct2++] = row;
478           ct3++;
479         }
480       }
      /* expand each requested row by one overlap level over both blocks */
481       for (k=oct2,max2=ct2; k<max2; k++) {
482         row   = xdata[i][k] - rstart;
483         start = ai[row];
484         end   = ai[row+1];
485         for (l=start; l<end; l++) { /* Amat: local columns, shift by cstart */
486           val = aj[l] + cstart;
487           if (!PetscBTLookupSet(xtable,val)) {
488             if (!(ct3 < mem_estimate)) {
489               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
490               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
491               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
492               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
493               xdata[0]     = tmp;
494               mem_estimate = new_estimate; ++no_malloc;
495               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
496             }
497             xdata[i][ct2++] = val;
498             ct3++;
499           }
500         }
501         start = bi[row];
502         end   = bi[row+1];
503         for (l=start; l<end; l++) { /* Bmat: garray maps compressed column to global */
504           val = garray[bj[l]];
505           if (!PetscBTLookupSet(xtable,val)) {
506             if (!(ct3 < mem_estimate)) {
507               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
508               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
509               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
510               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
511               xdata[0]     = tmp;
512               mem_estimate = new_estimate; ++no_malloc;
513               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
514             }
515             xdata[i][ct2++] = val;
516             ct3++;
517           }
518         }
519       }
520       /* Update the header*/
521       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
522       xdata[i][2*j-1] = rbuf_i[2*j-1];
523     }
524     xdata[i][0] = rbuf_0;
525     xdata[i+1]  = xdata[i] + ct2;
526     isz1[i]     = ct2; /* size of each message */
527   }
528   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  /* Fixed: format previously had three %D for four arguments, so rank was
     printed as the byte count and no_malloc was dropped; prefix rank as [%d] */
529   ierr = PetscInfo4(C,"[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
530   PetscFunctionReturn(0);
531 }
532 /* -------------------------------------------------------------------------*/
533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
535 /*
536     Every processor gets the entire matrix
537 */
538 #undef __FUNCT__
539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gives every process a full sequential copy of
   the parallel matrix A.

   With scall == MAT_INITIAL_MATRIX the structure is built: row lengths and
   column indices are gathered to all ranks with MPI_Allgatherv, a SeqAIJ
   matrix B of size N x N is created on PETSC_COMM_SELF and *Bin is malloc'ed
   to hold it.  With MAT_REUSE_MATRIX the previously returned **Bin is reused.
   With flag == MAT_GET_VALUES the numerical values are gathered as well;
   otherwise only the structure is produced.

   Ownership: the caller receives the malloc'ed *Bin holding the new B.
   Note the local column indices of the off-diagonal block are merged around
   the diagonal block so each gathered row is in ascending global column order.
*/
540 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
541 {
542   Mat            B;
543   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
544   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
545   PetscErrorCode ierr;
546   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
547   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
548   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
549   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
550 
551   PetscFunctionBegin;
552   ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
553   ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);
554 
555   if (scall == MAT_INITIAL_MATRIX) {
556     /* ----------------------------------------------------------------
557          Tell every processor the number of nonzeros per row
558     */
559     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
560     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
561       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
562     }
563     sendcount = A->rmap->rend - A->rmap->rstart;
564     ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
565     for (i=0; i<size; i++) {
566       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
567       displs[i]     = A->rmap->range[i];
568     }
569 #if defined(PETSC_HAVE_MPI_IN_PLACE)
570     ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
571 #else
572     ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
573 #endif
574     /* ---------------------------------------------------------------
575          Create the sequential matrix of the same type as the local block diagonal
576     */
577     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
578     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
579     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
580     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
581     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
582     **Bin = B;
583     b = (Mat_SeqAIJ *)B->data;
584 
585     /*--------------------------------------------------------------------
586        Copy my part of matrix column indices over, building each row directly
587        in b->j at this rank's offset so the in-place Allgatherv below works
588     */
588     sendcount  = ad->nz + bd->nz;
589     jsendbuf   = b->j + b->i[rstarts[rank]];
590     a_jsendbuf = ad->j;
591     b_jsendbuf = bd->j;
592     n          = A->rmap->rend - A->rmap->rstart;
593     cnt        = 0;
594     for (i=0; i<n; i++) {
595 
596       /* put in lower diagonal portion */
597       m = bd->i[i+1] - bd->i[i];
598       while (m > 0) {
599         /* is it above diagonal (in bd (compressed) numbering) */
600         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
601         jsendbuf[cnt++] = garray[*b_jsendbuf++];
602         m--;
603       }
604 
605       /* put in diagonal portion */
606       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
607         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
608       }
609 
610       /* put in upper diagonal portion */
611       while (m-- > 0) {
612         jsendbuf[cnt++] = garray[*b_jsendbuf++];
613       }
614     }
615     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
616 
617     /*--------------------------------------------------------------------
618        Gather all column indices to all processors
619     */
620     for (i=0; i<size; i++) {
621       recvcounts[i] = 0;
622       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
623         recvcounts[i] += lens[j];
624       }
625     }
626     displs[0]  = 0;
627     for (i=1; i<size; i++) {
628       displs[i] = displs[i-1] + recvcounts[i-1];
629     }
630 #if defined(PETSC_HAVE_MPI_IN_PLACE)
631     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
632 #else
633     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
634 #endif
635     /*--------------------------------------------------------------------
636         Assemble the matrix into useable form (note numerical values not yet set)
637     */
638     /* set the b->ilen (length of each row) values */
639     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
640     /* set the b->i indices */
641     b->i[0] = 0;
642     for (i=1; i<=A->rmap->N; i++) {
643       b->i[i] = b->i[i-1] + lens[i-1];
644     }
645     ierr = PetscFree(lens);CHKERRQ(ierr);
646     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
647     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
648 
649   } else {
650     B  = **Bin;
651     b = (Mat_SeqAIJ *)B->data;
652   }
653 
654   /*--------------------------------------------------------------------
655        Copy my part of matrix numerical values into the values location,
656        merging bd values around ad values in the same order as the indices above
657   */
657   if (flag == MAT_GET_VALUES){
658     sendcount = ad->nz + bd->nz;
659     sendbuf   = b->a + b->i[rstarts[rank]];
660     a_sendbuf = ad->a;
661     b_sendbuf = bd->a;
662     b_sendj   = bd->j;
663     n         = A->rmap->rend - A->rmap->rstart;
664     cnt       = 0;
665     for (i=0; i<n; i++) {
666 
667       /* put in lower diagonal portion */
668       m = bd->i[i+1] - bd->i[i];
669       while (m > 0) {
670         /* is it above diagonal (in bd (compressed) numbering) */
671         if (garray[*b_sendj] > A->rmap->rstart + i) break;
672         sendbuf[cnt++] = *b_sendbuf++;
673         m--;
674         b_sendj++;
675       }
676 
677       /* put in diagonal portion */
678       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
679         sendbuf[cnt++] = *a_sendbuf++;
680       }
681 
682       /* put in upper diagonal portion */
683       while (m-- > 0) {
684         sendbuf[cnt++] = *b_sendbuf++;
685         b_sendj++;
686       }
687     }
688     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
689 
690     /* -----------------------------------------------------------------
691        Gather all numerical values to all processors
692     */
693     if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above, so build them here */
694       ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
695     }
696     for (i=0; i<size; i++) {
697       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
698     }
699     displs[0]  = 0;
700     for (i=1; i<size; i++) {
701       displs[i] = displs[i-1] + recvcounts[i-1];
702     }
703     recvbuf   = b->a;
704 #if defined(PETSC_HAVE_MPI_IN_PLACE)
705     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
706 #else
707     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
708 #endif
709   }  /* endof (flag == MAT_GET_VALUES) */
  /* safe in all paths: PetscFree2 of still-zero pointers is a no-op */
710   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
711 
712   if (A->symmetric){
713     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
714   } else if (A->hermitian) {
715     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
716   } else if (A->structurally_symmetric) {
717     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
718   }
719   PetscFunctionReturn(0);
720 }
721 
722 
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices of C given
   by row/column index-set pairs.  The fast all-matrix path is taken when every
   rank asks for the identity on a square matrix; otherwise the extraction is
   done in stages so that the per-stage scratch memory stays near 20 MB.

   Collective: both MPI_Allreduce calls and every stage's call to
   MatGetSubMatrices_MPIAIJ_Local must be executed by all ranks, which is why
   ranks that have exhausted their own sets still loop with max_no = 0.
*/
726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
727 {
728   PetscErrorCode ierr;
729   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
730   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
731 
732   PetscFunctionBegin;
733   /*
734        Check for special case each processor gets entire matrix
735   */
736   if (ismax == 1 && C->rmap->N == C->cmap->N) {
737     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
738     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
739     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
740     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
741     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
742       wantallmatrix = PETSC_TRUE;
743       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
744     }
745   }
  /* MPI_MIN: take the fast path only if EVERY rank wants the whole matrix.
     NOTE(review): PetscBool reduced with MPI_INT - relies on PetscBool being
     int-sized; confirm against the PETSc version in use */
746   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
747   if (twantallmatrix) {
748     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
749     PetscFunctionReturn(0);
750   }
751 
752   /* Allocate memory to hold all the submatrices */
753   if (scall != MAT_REUSE_MATRIX) {
754     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
755   }
756   /* Determine the number of stages through which submatrices are done */
  /* NOTE(review): divides by C->cmap->N - would fault on a 0-column matrix;
     presumably unreachable here, but worth confirming */
757   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
758   /*
759      Each stage will extract nmax submatrices.
760      nmax is determined by the matrix column dimension.
761      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
762   */
763   if (!nmax) nmax = 1;
764   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
765 
766   /* Make sure every processor loops through the nstages */
767   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
768 
769   for (i=0,pos=0; i<nstages; i++) {
770     if (pos+nmax <= ismax) max_no = nmax;
771     else if (pos == ismax) max_no = 0;
772     else                   max_no = ismax-pos;
773     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
774     pos += max_no;
775   }
776   PetscFunctionReturn(0);
777 }
778 
779 /* -------------------------------------------------------------------------*/
780 #undef __FUNCT__
781 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  /*
     Extract ismax sequential (SeqAIJ) submatrices of the parallel matrix C,
     one per (isrow[i],iscol[i]) pair, into submats[].  Rows owned by other
     processes are fetched with nonblocking point-to-point messages:
       tag0 - requested row numbers, tag1 - reply with per-row lengths,
       tag2 - column indices,        tag3 - numerical values.
     The send/receive ordering below deliberately overlaps communication with
     local assembly work; do not reorder the MPI calls casually.
  */
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscBool      sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* Check if the col indices are sorted (the error checks themselves are currently disabled) */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] - total number of rows requested from proc p; w2[p] - 1 iff a message goes to p;
     w3[p] - number of ISs that have rows on p; w4 - per-IS scratch row count */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* find the owner of global row irow_i[j] by scanning the ownership ranges */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the first msz entries of tmp into per-destination send buffers sbuf1[j] */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* NOTE: rbuf2 aliases the second half of tmp (allocated in the PetscMalloc4
     above), so only the rbuf2 pointer array itself is freed at the end. */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    /* for each incoming request, reply with the length (diag + off-diag nonzeros)
       of every requested row, echoing the request header in front */
    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  /* rbuf2[idex][0] holds the total nonzero count for this reply; size the
     column-index (rbuf3) and value (rbuf4) receive buffers from it */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols: off-diag columns
             below the diagonal block, then diag columns, then off-diag columns
             above -- yields globally increasing column order */
          cols  = sbuf_aj_i + ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in the same order
             as the column indices sent above */
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: maps a global column to (1 + its position in iscol[i]),
     0 meaning "not in this submatrix" */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr);
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data: count, for each locally-owned requested row,
     how many of its columns fall in iscol[i] */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
          tcol = cmap_i[cols[k]];
#endif
          if (tcol) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row -> local row of submats[i] (table entries are
     shifted by one, the plain-array entries are not -- note the asymmetry) */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol){
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    ierr = PetscTableDestroy((PetscTable)cmap[i]);CHKERRQ(ierr);
    ierr = PetscTableDestroy((PetscTable)rmap[i]);CHKERRQ(ierr);
  }
#else
  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1454 
1455 /*
1456  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1457  Be careful not to destroy them elsewhere.
1458  */
1459 #undef __FUNCT__
1460 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1461 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1462 {
1463   /* If making this function public, change the error returned in this function away from _PLIB. */
1464   PetscErrorCode ierr;
1465   Mat_MPIAIJ *aij;
1466   PetscBool seqaij;
1467 
1468   PetscFunctionBegin;
1469   /* Check to make sure the component matrices are compatible with C. */
1470   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1471   if(!seqaij) {
1472     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1473   }
1474   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1475   if(!seqaij) {
1476     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1477   }
1478   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1479     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1480   }
1481   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1482   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1483   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1484   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1485   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1486   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1487   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1488     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1489   }
1490   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1491   aij = (Mat_MPIAIJ*)((*C)->data);
1492   aij->A = A;
1493   aij->B = B;
1494   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1495   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1496   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1497   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1498   PetscFunctionReturn(0);
1499 }
1500 
1501 #undef __FUNCT__
1502 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1503 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1504 {
1505   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1506   PetscFunctionBegin;
1507   PetscValidPointer(A,2);
1508   PetscValidPointer(B,3);
1509   *A = aij->A;
1510   *B = aij->B;
1511   /* Note that we don't incref *A and *B, so be careful! */
1512   PetscFunctionReturn(0);
1513 }
1514 
1515 
1516 #undef __FUNCT__
1517 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1518 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1519 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1520 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1521 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1522 {
1523   PetscErrorCode ierr;
1524   PetscMPIInt size, flag;
1525   PetscInt    i,ii;
1526   PetscInt    ismax_c;
1527   PetscFunctionBegin;
1528   if(!ismax) {
1529     PetscFunctionReturn(0);
1530   }
1531   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1532     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1533     if(flag != MPI_IDENT) {
1534       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1535     }
1536     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1537     if(size > 1) {
1538       ++ismax_c;
1539     }
1540   }
1541   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1542     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1543   }
1544   else { /* if(ismax_c) */
1545     Mat *A,*B;
1546     IS  *isrow_c, *iscol_c;
1547     PetscMPIInt size;
1548     /*
1549      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1550      array of sequential matrices underlying the resulting parallel matrices.
1551      Which arrays to allocate is based on the value of MatReuse scall.
1552      There are as many diag matrices as there are original index sets.
1553      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1554 
1555      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1556      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1557      will deposite the extracted diag and off-diag parts.
1558      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1559      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1560     */
1561 
1562     /* Parallel matrix array is allocated only if no reuse is taking place. */
1563     if (scall != MAT_REUSE_MATRIX) {
1564       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1565     }
1566     else {
1567       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1568       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1569       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1570       for(i = 0, ii = 0; i < ismax; ++i) {
1571         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1572         if(size > 1) {
1573           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1574           ++ii;
1575         }
1576         else {
1577           A[i] = (*submat)[i];
1578         }
1579       }
1580     }
1581     /*
1582      Construct the complements of the iscol ISs for parallel ISs only.
1583      These are used to extract the off-diag portion of the resulting parallel matrix.
1584      The row IS for the off-diag portion is the same as for the diag portion,
1585      so we merely alias the row IS, while skipping those that are sequential.
1586     */
1587     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1588     for(i = 0, ii = 0; i < ismax; ++i) {
1589       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1590       if(size > 1) {
1591 	isrow_c[ii] = isrow[i];
1592 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1593 	++ii;
1594       }
1595     }
1596     /* Now obtain the sequential A and B submatrices separately. */
1597     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1598     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1599     for(ii = 0; ii < ismax_c; ++ii) {
1600       ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
1601     }
1602     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1603     /*
1604      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1605      have been extracted directly into the parallel matrices containing them, or
1606      simply into the sequential matrix identical with the corresponding A (if size == 1).
1607      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1608      A is put into the correct submat slot (if size == 1).
1609      */
1610     if(scall != MAT_REUSE_MATRIX) {
1611       for(i = 0, ii = 0; i < ismax; ++i) {
1612         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1613         if(size > 1) {
1614           /*
1615            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1616            */
1617           /* Construct submat[i] from the Seq pieces A and B. */
1618           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1619 
1620           ++ii;
1621         }
1622         else {
1623           (*submat)[i] = A[i];
1624         }
1625       }
1626     }
1627     ierr = PetscFree(A); CHKERRQ(ierr);
1628     ierr = PetscFree(B); CHKERRQ(ierr);
1629   }
1630   PetscFunctionReturn(0);
1631 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1632 
1633 
1634 
1635 #undef __FUNCT__
1636 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1637 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1638 {
1639   PetscErrorCode ierr;
1640   PetscFunctionBegin;
1641   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1642                                            MatCreateMPIAIJFromSeqMatrices_Private,
1643                                            MatMPIAIJExtractSeqMatrices_Private);
1644   CHKERRQ(ierr);
1645   PetscFunctionReturn(0);
1646 }
1647