xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision d88c663f2a63cd0b086af274cba34da395d9b43b)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
52 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - one round of overlap expansion for all
   imax index sets in is[]:
     1. count, per destination rank, the indices each IS owns remotely;
     2. send those indices out (tag1); each message starts with a header
        [#ISs, (is-number,count) pairs] followed by the index data;
     3. expand locally-owned indices via MatIncreaseOverlap_MPIAIJ_Local
        while remote ranks expand the requested rows via
        MatIncreaseOverlap_MPIAIJ_Receive;
     4. receive the expanded rows back (tag2), merge them into per-IS bit
        tables, and rebuild each is[i] from the merged data.
   Note: the incoming index sets are destroyed and replaced by the
   enlarged ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  /* idx[i]/n[i]: raw indices and length of each input IS */
  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p]: total #indices destined for rank p, w3[p]: #ISs contributing to
     that message, w4: per-IS scratch for the same counts */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages: outdat[p] points into the single
     buffer tmp; ptr[p] is the write cursor past p's header */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve tmp into per-destination slices of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table (t_p slice) and one
     index array (d_p slice) of global size M per IS */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;      /* count of indices for this IS */
          outdat_j[2*k-1] = i;          /* which IS they belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input sets are consumed; they are recreated from the merged data below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps the communication posted above) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply for the
     i-th incoming message (all replies share the buffer rooted at xdata[0]) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the requests */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          /* bit table suppresses duplicates across local and remote results */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild each IS from the merged (local + remote) data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz: total #indices requested across all messages; ct: total #ISs */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* initial size guess for the single shared reply buffer (xdata[0]);
     grown on demand below, so only efficiency depends on this estimate */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else      max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct1: read cursor into rbuf[i]; ct2: write cursor into xdata[i];
     ct3: total #ints written across all replies (vs mem_estimate) */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow xdata[0] by 1.5x and re-base xdata[1..i] into the new buffer */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each copied row by its column indices */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block, via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* replies are packed back-to-back */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
530 /* -------------------------------------------------------------------------*/
531 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
532 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
533 /*
534     Every processor gets the entire matrix
535 */
536 #undef __FUNCT__
537 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - builds (or refreshes) a sequential copy of
   the entire parallel matrix on every process, via MPI_Allgatherv of row
   lengths, column indices, and (optionally) numerical values.

   flag  - MAT_GET_VALUES gathers the numerical values too; otherwise only
           the nonzero structure is assembled
   scall - MAT_INITIAL_MATRIX allocates and creates *Bin; any other value
           reuses the previously created **Bin
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = diagonal-block nz + off-diagonal-block nz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over; the local rows are
       written directly into B's j array at this rank's row offset
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse path: structure already gathered on a previous call */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the column indices above,
       so values line up with the gathered j array */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path did not allocate these above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
719 
720 
721 
722 #undef __FUNCT__
723 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
724 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
725 {
726   PetscErrorCode ierr;
727   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
728   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
729 
730   PetscFunctionBegin;
731   /*
732        Check for special case each processor gets entire matrix
733   */
734   if (ismax == 1 && C->rmap->N == C->cmap->N) {
735     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
736     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
737     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
738     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
739     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
740       wantallmatrix = PETSC_TRUE;
741       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
742     }
743   }
744   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
745   if (twantallmatrix) {
746     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
747     PetscFunctionReturn(0);
748   }
749 
750   /* Allocate memory to hold all the submatrices */
751   if (scall != MAT_REUSE_MATRIX) {
752     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
753   }
754   /* Determine the number of stages through which submatrices are done */
755   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
756   /*
757      Each stage will extract nmax submatrices.
758      nmax is determined by the matrix column dimension.
759      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
760   */
761   if (!nmax) nmax = 1;
762   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
763 
764   /* Make sure every processor loops through the nstages */
765   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
766 
767   for (i=0,pos=0; i<nstages; i++) {
768     if (pos+nmax <= ismax) max_no = nmax;
769     else if (pos == ismax) max_no = 0;
770     else                   max_no = ismax-pos;
771     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
772     pos += max_no;
773   }
774   PetscFunctionReturn(0);
775 }
776 
777 /* -------------------------------------------------------------------------*/
778 #undef __FUNCT__
779 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
    MatGetSubMatrices_MPIAIJ_Local - Extracts the sequential submatrices
  C[isrow[i],iscol[i]], i = 0..ismax-1, of the parallel AIJ matrix C into
  submats[] (one SeqAIJ matrix per index-set pair).

  Rows owned by other processes are fetched via a four-round message exchange:
    - tag0: each process sends the global row numbers it needs to their owners,
    - tag1: the owners reply with the nonzero count of each requested row,
    - tag2: the owners ship the global column indices of those rows,
    - tag3: the owners ship the matching numerical values.

  With scall == MAT_REUSE_MATRIX the matrices already in submats[] are refilled;
  they must have the same sizes and nonzero counts as when first created
  (checked below).  Otherwise new SeqAIJ matrices are created in submats[].
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;   /* local diagonal block of C */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the indices and local sizes of every index set */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] - total number of rows to request from proc p (all ISs combined)
     w2[p] - 1 iff a message goes to proc p, else 0
     w3[p] - number of ISs that need rows from proc p
     w4[p] - rows the *current* IS needs from proc p (per-IS scratch) */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++; /* locate the owner of this global row */
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp holds 2*msz entries: the first msz are carved up into the send
     buffers sbuf1[]; the trailing msz are reused as rbuf2[] further below */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1; /* ptr[j] walks the data area just past the header */
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;  /* number of rows requested for IS i */
        sbuf1_j[2*k-1] = i;      /* which IS the rows belong to */
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz; /* reuse the second half of tmp (see PetscMalloc4 above) */
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    /* For each incoming request, compute the nonzero count (diag + off-diag)
       of every requested row and reply (tag1) with the request's header
       followed by these per-row counts; sbuf2_i[0] carries the grand total. */
    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1; /* first row index, just past the header */
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  /* As each size reply arrives, allocate matching buffers and post the
     receives for the column indices (tag2) and numerical values (tag3) */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    /* Pack, per requested row, the global column indices: off-diag entries
       with global column < cstart first, then the diagonal block, then
       off-diag entries with global column >= cend. */
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    /* Pack the numerical values in exactly the same order as the column
       indices above, so rbuf3/rbuf4 line up entry-for-entry at the receiver. */
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* cmap[i][col] (or the table entry for col+1) stores 1 + the local column
     of global column col in submatrix i; 0 means col is not in iscol[i]. */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr);
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][r] will hold the nonzero count of local row r of submatrix i */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
          tcol = cmap_i[cols[k]];
#endif
          if (tcol) { lens_i[j]++;}  /* count only columns present in iscol[i] */
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
  /* NOTE: the table variant stores the local row + 1 (decremented after
     lookup); the plain-array variant stores the 0-based local row directly. */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* As the column-index messages (tag2) arrive, walk each reply against the
     matching request header in sbuf1 to credit the right submatrix rows. */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol){
              *mat_j++ = tcol - 1;  /* cmap stores local column + 1 */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    /* As the value messages (tag3) arrive, insert the received column/value
       pairs (rbuf3/rbuf4, which line up entry-for-entry) into the submatrices
       named in the corresponding request header. */
    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
    ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);
  }
#else
  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1444 
1445 /*
1446  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1447  Be careful not to destroy them elsewhere.
1448  */
1449 #undef __FUNCT__
1450 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1451 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1452 {
1453   /* If making this function public, change the error returned in this function away from _PLIB. */
1454   PetscErrorCode ierr;
1455   Mat_MPIAIJ *aij;
1456   PetscBool seqaij;
1457 
1458   PetscFunctionBegin;
1459   /* Check to make sure the component matrices are compatible with C. */
1460   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1461   if(!seqaij) {
1462     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1463   }
1464   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1465   if(!seqaij) {
1466     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1467   }
1468   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1469     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1470   }
1471   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1472   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1473   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1474   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1475   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1476   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1477   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1478     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1479   }
1480   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1481   aij = (Mat_MPIAIJ*)((*C)->data);
1482   aij->A = A;
1483   aij->B = B;
1484   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1485   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1486   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1487   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1488   PetscFunctionReturn(0);
1489 }
1490 
1491 #undef __FUNCT__
1492 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1493 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1494 {
1495   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1496   PetscFunctionBegin;
1497   PetscValidPointer(A,2);
1498   PetscValidPointer(B,3);
1499   *A = aij->A;
1500   *B = aij->B;
1501   /* Note that we don't incref *A and *B, so be careful! */
1502   PetscFunctionReturn(0);
1503 }
1504 
1505 
1506 #undef __FUNCT__
1507 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1508 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1509 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1510 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1511 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1512 {
1513   PetscErrorCode ierr;
1514   PetscMPIInt size, flag;
1515   PetscInt    i,ii;
1516   PetscInt    ismax_c;
1517   PetscFunctionBegin;
1518   if(!ismax) {
1519     PetscFunctionReturn(0);
1520   }
1521   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1522     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1523     if(flag != MPI_IDENT) {
1524       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1525     }
1526     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1527     if(size > 1) {
1528       ++ismax_c;
1529     }
1530   }
1531   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1532     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1533   }
1534   else { /* if(ismax_c) */
1535     Mat *A,*B;
1536     IS  *isrow_c, *iscol_c;
1537     PetscMPIInt size;
1538     /*
1539      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1540      array of sequential matrices underlying the resulting parallel matrices.
1541      Which arrays to allocate is based on the value of MatReuse scall.
1542      There are as many diag matrices as there are original index sets.
1543      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1544 
     Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
     getsubmats_seq will deposit the extracted diag and off-diag parts.
1548      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1549      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1550     */
1551 
1552     /* Parallel matrix array is allocated only if no reuse is taking place. */
1553     if (scall != MAT_REUSE_MATRIX) {
1554       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1555     }
1556     else {
1557       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1558       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1559       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1560       for(i = 0, ii = 0; i < ismax; ++i) {
1561         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1562         if(size > 1) {
1563           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1564           ++ii;
1565         }
1566         else {
1567           A[i] = (*submat)[i];
1568         }
1569       }
1570     }
1571     /*
1572      Construct the complements of the iscol ISs for parallel ISs only.
1573      These are used to extract the off-diag portion of the resulting parallel matrix.
1574      The row IS for the off-diag portion is the same as for the diag portion,
1575      so we merely alias the row IS, while skipping those that are sequential.
1576     */
1577     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1578     for(i = 0, ii = 0; i < ismax; ++i) {
1579       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1580       if(size > 1) {
1581 	isrow_c[ii] = isrow[i];
1582 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1583 	++ii;
1584       }
1585     }
1586     /* Now obtain the sequential A and B submatrices separately. */
1587     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1588     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1589     for(ii = 0; ii < ismax_c; ++ii) {
1590       ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
1591     }
1592     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1593     /*
1594      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1595      have been extracted directly into the parallel matrices containing them, or
1596      simply into the sequential matrix identical with the corresponding A (if size == 1).
1597      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1598      A is put into the correct submat slot (if size == 1).
1599      */
1600     if(scall != MAT_REUSE_MATRIX) {
1601       for(i = 0, ii = 0; i < ismax; ++i) {
1602         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1603         if(size > 1) {
1604           /*
1605            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1606            */
1607           /* Construct submat[i] from the Seq pieces A and B. */
1608           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1609 
1610           ++ii;
1611         }
1612         else {
1613           (*submat)[i] = A[i];
1614         }
1615       }
1616     }
1617     ierr = PetscFree(A); CHKERRQ(ierr);
1618     ierr = PetscFree(B); CHKERRQ(ierr);
1619   }
1620   PetscFunctionReturn(0);
1621 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1622 
1623 
1624 
1625 #undef __FUNCT__
1626 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1627 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1628 {
1629   PetscErrorCode ierr;
1630   PetscFunctionBegin;
1631   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1632                                            MatCreateMPIAIJFromSeqMatrices_Private,
1633                                            MatMPIAIJExtractSeqMatrices_Private);
1634   CHKERRQ(ierr);
1635   PetscFunctionReturn(0);
1636 }
1637