xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 7b6bb2c608b6fc6714ef38fda02c2dbb91c82665)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------
48 
  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
*/
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Increases each of the imax index sets in is[]
   by exactly one level of overlap.

   Two communication phases are used:
     phase 1: each process sends the rows it needs processed to their owners (tag1);
     phase 2: owners send back the column indices found in those rows (tag2).
   Rows owned locally are handled by MatIncreaseOverlap_MPIAIJ_Local without
   any communication.  The input index sets are destroyed and replaced with
   newly created (enlarged) ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  /* fresh tags so the two phases cannot interfere with other pending traffic */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total indices destined for proc p, w3[p] = number of index sets
     that contribute to proc p; w4 is a per-IS scratch counter. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no message sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp is one slab of size msz carved
     up among the destination processes via outdat[] */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: outdat[j][0] = no of index sets in the message,
     followed by (is-number,count) pairs; ptr[j] points past the header */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one slab of PetscInt for all data[]
     arrays and one slab of char for all bit tables */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table; bit table dedups insertions */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* destroy the inputs; they are recreated (enlarged) at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do local work while the communication is in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the requests received in phase 1; xdata[0] owns one large block
     holding all reply messages, xdata[i] points inside it */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      /* isz1[] is ordered like rbuf[] (i.e. like onodes1), so the MPI_SOURCE
         recorded by Waitall must agree with the posted receive order */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: header is (count, is-number) pairs followed by data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the (now enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i];
then if a realloc is required we need only copy the data for that one row
rather than all previous rows, as it is now where a single large chunk of
memory is used.

*/
414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
415 {
416   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
417   Mat            A = c->A,B = c->B;
418   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
419   PetscErrorCode ierr;
420   PetscMPIInt    rank;
421   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
422   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
423   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
424   PetscInt       *rbuf_i,kmax,rbuf_0;
425   PetscBT        xtable;
426 
427   PetscFunctionBegin;
428   rank   = c->rank;
429   m      = C->rmap->N;
430   rstart = C->rmap->rstart;
431   cstart = C->cmap->rstart;
432   ai     = a->i;
433   aj     = a->j;
434   bi     = b->i;
435   bj     = b->j;
436   garray = c->garray;
437 
438 
439   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
440     rbuf_i  =  rbuf[i];
441     rbuf_0  =  rbuf_i[0];
442     ct     += rbuf_0;
443     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
444   }
445 
446   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
447   else      max1 = 1;
448   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
449   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
450   ++no_malloc;
451   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
452   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
453 
454   ct3 = 0;
455   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
456     rbuf_i =  rbuf[i];
457     rbuf_0 =  rbuf_i[0];
458     ct1    =  2*rbuf_0+1;
459     ct2    =  ct1;
460     ct3    += ct1;
461     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
462       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
463       oct2 = ct2;
464       kmax = rbuf_i[2*j];
465       for (k=0; k<kmax; k++,ct1++) {
466         row = rbuf_i[ct1];
467         if (!PetscBTLookupSet(xtable,row)) {
468           if (!(ct3 < mem_estimate)) {
469             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
470             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
471             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
472             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
473             xdata[0]     = tmp;
474             mem_estimate = new_estimate; ++no_malloc;
475             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
476           }
477           xdata[i][ct2++] = row;
478           ct3++;
479         }
480       }
481       for (k=oct2,max2=ct2; k<max2; k++) {
482         row   = xdata[i][k] - rstart;
483         start = ai[row];
484         end   = ai[row+1];
485         for (l=start; l<end; l++) {
486           val = aj[l] + cstart;
487           if (!PetscBTLookupSet(xtable,val)) {
488             if (!(ct3 < mem_estimate)) {
489               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
490               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
491               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
492               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
493               xdata[0]     = tmp;
494               mem_estimate = new_estimate; ++no_malloc;
495               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
496             }
497             xdata[i][ct2++] = val;
498             ct3++;
499           }
500         }
501         start = bi[row];
502         end   = bi[row+1];
503         for (l=start; l<end; l++) {
504           val = garray[bj[l]];
505           if (!PetscBTLookupSet(xtable,val)) {
506             if (!(ct3 < mem_estimate)) {
507               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
508               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
509               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
510               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
511               xdata[0]     = tmp;
512               mem_estimate = new_estimate; ++no_malloc;
513               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
514             }
515             xdata[i][ct2++] = val;
516             ct3++;
517           }
518         }
519       }
520       /* Update the header*/
521       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
522       xdata[i][2*j-1] = rbuf_i[2*j-1];
523     }
524     xdata[i][0] = rbuf_0;
525     xdata[i+1]  = xdata[i] + ct2;
526     isz1[i]     = ct2; /* size of each message */
527   }
528   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
529   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
530   PetscFunctionReturn(0);
531 }
532 /* -------------------------------------------------------------------------*/
533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
535 /*
536     Every processor gets the entire matrix
537 */
538 #undef __FUNCT__
539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gives every process a sequential copy of the
   entire parallel matrix A.

   With MAT_INITIAL_MATRIX the structure (row lengths and column indices) is
   gathered with MPI_Allgatherv and a new SeqAIJ matrix B is created; with
   MAT_REUSE_MATRIX only the numerical values are refreshed.  With
   MAT_GET_VALUES the values are gathered as well; otherwise only the
   structure is produced.  *Bin is allocated here and owns the single Mat.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into
       B's j array at this rank's row offset
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse path: B was created by a previous MAT_INITIAL_MATRIX call */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path: recvcounts/displs were not allocated above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information so solvers can exploit it on the copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
721 
722 
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
727 {
728   PetscErrorCode ierr;
729   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
730   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
731 
732   PetscFunctionBegin;
733   /*
734        Check for special case each processor gets entire matrix
735   */
736   if (ismax == 1 && C->rmap->N == C->cmap->N) {
737     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
738     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
739     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
740     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
741     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
742       wantallmatrix = PETSC_TRUE;
743       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
744     }
745   }
746   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
747   if (twantallmatrix) {
748     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
749     PetscFunctionReturn(0);
750   }
751 
752   /* Allocate memory to hold all the submatrices */
753   if (scall != MAT_REUSE_MATRIX) {
754     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
755   }
756   /* Determine the number of stages through which submatrices are done */
757   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
758   /*
759      Each stage will extract nmax submatrices.
760      nmax is determined by the matrix column dimension.
761      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
762   */
763   if (!nmax) nmax = 1;
764   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
765 
766   /* Make sure every processor loops through the nstages */
767   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
768 
769   for (i=0,pos=0; i<nstages; i++) {
770     if (pos+nmax <= ismax) max_no = nmax;
771     else if (pos == ismax) max_no = 0;
772     else                   max_no = ismax-pos;
773     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
774     pos += max_no;
775   }
776   PetscFunctionReturn(0);
777 }
778 
779 /* -------------------------------------------------------------------------*/
780 #undef __FUNCT__
781 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - extracts up to ismax sequential submatrices,
   one per (isrow[i], iscol[i]) pair, from the parallel AIJ matrix C into submats[].

   The communication proceeds in stages, each with its own MPI tag:
     tag0: each process sends the list of global rows it needs to each owner
           (message layout: [#IS | is_no,count | is_no,count | ... | rows...],
           see the sample-message comment near the top of this file);
     tag1: the owner replies with the nonzero count of every requested row;
     tag2: the owner sends the global column indices of those rows;
     tag3: the owner sends the numerical values of those rows.
   Locally-owned rows are copied directly via MatGetRow_MPIAIJ; off-process
   rows are filled in as the tag2/tag3 messages arrive.  rmap/cmap translate
   global row/column indices of C into local indices of each submatrix, and
   lens[] accumulates the per-row nonzero counts used for preallocation.

   scall == MAT_REUSE_MATRIX requires submats[] to already have the correct
   sizes and nonzero pattern (checked below); otherwise new SeqAIJ matrices
   are created.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w4[p] = #rows of the current IS owned by proc p; accumulated into
     w1[p] = total #rows requested of p, w3[p] = #ISs that touch p,
     w2[p] = 1 iff any message goes to p */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear search over the ownership ranges for the process owning this global row */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, and their from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* Carve each send buffer sbuf1[j] (length w1[j]) out of the single workspace tmp */
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* For each incoming request (tag0), count the nonzeros of every requested row
     (diagonal part sA plus off-diagonal part sB) and reply with those counts (tag1) */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  /* As each tag1 reply (buffer sizes) arrives, allocate the column-index (rbuf3)
     and value (rbuf4) buffers and post the matching tag2/tag3 receives */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  /* For each requested row, emit its global column indices in ascending order:
     off-diagonal columns left of the diagonal block, then diagonal-block columns
     (shifted by cstart), then off-diagonal columns right of it */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* The values are packed in exactly the same order as the column indices above */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* Both variants store local column + 1 so that 0 can mean "column not in this submatrix" */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr);
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
          tcol = cmap_i[cols[k]];
#endif
          if (tcol) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
  /* NOTE: the CTABLE branch stores local row + 1 (the table cannot hold 0) and
     lookups below compensate with row--; the array branch stores the local row directly */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* As the column-index messages (tag2) arrive, count how many entries of each
     requested row fall inside the submatrix columns */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol){
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* The tag3 messages carry the values in the same order as the tag2 column
     indices, so ct2 walks both rbuf3_i and rbuf4_i in lockstep */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
    ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);
  }
#else
  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1446 
1447 /*
1448  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1449  Be careful not to destroy them elsewhere.
1450  */
1451 #undef __FUNCT__
1452 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1453 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1454 {
1455   /* If making this function public, change the error returned in this function away from _PLIB. */
1456   PetscErrorCode ierr;
1457   Mat_MPIAIJ *aij;
1458   PetscBool seqaij;
1459 
1460   PetscFunctionBegin;
1461   /* Check to make sure the component matrices are compatible with C. */
1462   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1463   if(!seqaij) {
1464     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1465   }
1466   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1467   if(!seqaij) {
1468     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1469   }
1470   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1471     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1472   }
1473   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1474   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1475   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1476   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1477   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1478   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1479   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1480     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1481   }
1482   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1483   aij = (Mat_MPIAIJ*)((*C)->data);
1484   aij->A = A;
1485   aij->B = B;
1486   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1487   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1488   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1489   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1490   PetscFunctionReturn(0);
1491 }
1492 
1493 #undef __FUNCT__
1494 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1495 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1496 {
1497   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1498   PetscFunctionBegin;
1499   PetscValidPointer(A,2);
1500   PetscValidPointer(B,3);
1501   *A = aij->A;
1502   *B = aij->B;
1503   /* Note that we don't incref *A and *B, so be careful! */
1504   PetscFunctionReturn(0);
1505 }
1506 
1507 
1508 #undef __FUNCT__
1509 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1510 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1511 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1512 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1513 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1514 {
1515   PetscErrorCode ierr;
1516   PetscMPIInt size, flag;
1517   PetscInt    i,ii;
1518   PetscInt    ismax_c;
1519   PetscFunctionBegin;
1520   if(!ismax) {
1521     PetscFunctionReturn(0);
1522   }
1523   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1524     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1525     if(flag != MPI_IDENT) {
1526       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1527     }
1528     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1529     if(size > 1) {
1530       ++ismax_c;
1531     }
1532   }
1533   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1534     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1535   }
1536   else { /* if(ismax_c) */
1537     Mat *A,*B;
1538     IS  *isrow_c, *iscol_c;
1539     PetscMPIInt size;
1540     /*
1541      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1542      array of sequential matrices underlying the resulting parallel matrices.
1543      Which arrays to allocate is based on the value of MatReuse scall.
1544      There are as many diag matrices as there are original index sets.
1545      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1546 
1547      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1548      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1549      will deposite the extracted diag and off-diag parts.
1550      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1551      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1552     */
1553 
1554     /* Parallel matrix array is allocated only if no reuse is taking place. */
1555     if (scall != MAT_REUSE_MATRIX) {
1556       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1557     }
1558     else {
1559       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1560       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1561       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1562       for(i = 0, ii = 0; i < ismax; ++i) {
1563         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1564         if(size > 1) {
1565           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1566           ++ii;
1567         }
1568         else {
1569           A[i] = (*submat)[i];
1570         }
1571       }
1572     }
1573     /*
1574      Construct the complements of the iscol ISs for parallel ISs only.
1575      These are used to extract the off-diag portion of the resulting parallel matrix.
1576      The row IS for the off-diag portion is the same as for the diag portion,
1577      so we merely alias the row IS, while skipping those that are sequential.
1578     */
1579     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1580     for(i = 0, ii = 0; i < ismax; ++i) {
1581       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1582       if(size > 1) {
1583 	isrow_c[ii] = isrow[i];
1584 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1585 	++ii;
1586       }
1587     }
1588     /* Now obtain the sequential A and B submatrices separately. */
1589     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1590     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1591     for(ii = 0; ii < ismax_c; ++ii) {
1592       ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
1593     }
1594     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1595     /*
1596      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1597      have been extracted directly into the parallel matrices containing them, or
1598      simply into the sequential matrix identical with the corresponding A (if size == 1).
1599      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1600      A is put into the correct submat slot (if size == 1).
1601      */
1602     if(scall != MAT_REUSE_MATRIX) {
1603       for(i = 0, ii = 0; i < ismax; ++i) {
1604         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1605         if(size > 1) {
1606           /*
1607            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1608            */
1609           /* Construct submat[i] from the Seq pieces A and B. */
1610           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1611 
1612           ++ii;
1613         }
1614         else {
1615           (*submat)[i] = A[i];
1616         }
1617       }
1618     }
1619     ierr = PetscFree(A); CHKERRQ(ierr);
1620     ierr = PetscFree(B); CHKERRQ(ierr);
1621   }
1622   PetscFunctionReturn(0);
1623 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1624 
1625 
1626 
1627 #undef __FUNCT__
1628 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1629 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1630 {
1631   PetscErrorCode ierr;
1632   PetscFunctionBegin;
1633   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1634                                            MatCreateMPIAIJFromSeqMatrices_Private,
1635                                            MatMPIAIJExtractSeqMatrices_Private);
1636   CHKERRQ(ierr);
1637   PetscFunctionReturn(0);
1638 }
1639