xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 2c4ddcd83c0a3d14ad4344fe9ab541864a131b64)
1 
2 #define PETSCMAT_DLL
3 
4 /*
5    Routines to compute overlapping regions of a parallel MPI matrix
6   and to find submatrices that were shared across processors.
7 */
8 #include "../src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16 
17 #undef __FUNCT__
18 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
19 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
20 {
21   PetscErrorCode ierr;
22   PetscInt       i;
23 
24   PetscFunctionBegin;
25   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26   for (i=0; i<ov; ++i) {
27     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
28   }
29   PetscFunctionReturn(0);
30 }
31 
32 /*
33   Sample message format:
34   If a processor A wants processor B to process some elements corresponding
35   to index sets is[1],is[5]
36   mesg [0] = 2   (no of index sets in the mesg)
37   -----------
38   mesg [1] = 1 => is[1]
39   mesg [2] = sizeof(is[1]);
40   -----------
41   mesg [3] = 5  => is[5]
42   mesg [4] = sizeof(is[5]);
43   -----------
44   mesg [5]
45   mesg [n]  datas[1]
46   -----------
47   mesg[n+1]
48   mesg[m]  data(is[5])
49   -----------
50 
51   Notes:
52   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
54 */
55 #undef __FUNCT__
56 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one round of overlap increase:
   every row index in is[] that belongs to another process is shipped there,
   the owning process expands it by its local adjacency (see
   MatIncreaseOverlap_MPIAIJ_Receive), and the expanded sets are shipped back
   and merged into new IS objects.  The original IS objects are destroyed and
   replaced in place.

   Communication proceeds in two phases (tag1: requests out, tag2: answers
   back), with per-IS duplicate elimination done through PetscBT bit tables.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  m      = C->rmap->N;          /* global number of rows */

  /* two fresh tags so phase-1 and phase-2 traffic cannot be confused */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc*/
  for (i=0,j=0; i<size; i++) {
    len = C->rmap->range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      proc = rtable[row];
      w4[proc]++;               /* rows of is[i] owned by proc */
    }
    for (j=0; j<size; j++){
      /* w1[j]: total indices destined for j; w3[j]: no of IS that touch j */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];   /* grow message length to include the header */
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into per-destination slices of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;           /* slot 0: count of IS in this message */
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* data region starts after header */
  }

  /* Memory for doing local proc's work*/
  {
    PetscInt  *d_p;
    char      *t_p;

    /* should replace with PetscMallocN() */
    /* one allocation carved into: imax bit tables, imax data arrays of length m,
       and the isz counters — all zeroed */
    ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data  = (PetscInt **)(table + imax);
    isz   = (PetscInt  *)(data  + imax);
    d_p   = (PetscInt  *)(isz   + imax);
    t_p   = (char *)(d_p   + m*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (m)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          /* PetscBTLookupSet returns the previous bit, so each row enters once */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;      /* no of indices for IS i sent to j */
          outdat_j[2*k-1] = i;          /* which IS they belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  /* the IS will be re-created from data[]/isz[] at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process each received request: expand the requested rows locally */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      /* PetscPostIrecvInt posted the recvs in onodes1 order, so sources match */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* handle replies in whatever order they arrive */
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;   /* data region starts right after the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];   /* no of new indices for this IS */
        is_no   = rbuf2_i[2*j-1]; /* which IS they extend */
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (now enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
346 
347 #undef __FUNCT__
348 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
364 {
365   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
366   Mat        A = c->A,B = c->B;
367   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
368   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
369   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
370   PetscBT    table_i;
371 
372   PetscFunctionBegin;
373   rstart = C->rmap->rstart;
374   cstart = C->cmap->rstart;
375   ai     = a->i;
376   aj     = a->j;
377   bi     = b->i;
378   bj     = b->j;
379   garray = c->garray;
380 
381 
382   for (i=0; i<imax; i++) {
383     data_i  = data[i];
384     table_i = table[i];
385     isz_i   = isz[i];
386     for (j=0,max=isz[i]; j<max; j++) {
387       row   = data_i[j] - rstart;
388       start = ai[row];
389       end   = ai[row+1];
390       for (k=start; k<end; k++) { /* Amat */
391         val = aj[k] + cstart;
392         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
393       }
394       start = bi[row];
395       end   = bi[row+1];
396       for (k=start; k<end; k++) { /* Bmat */
397         val = garray[bj[k]];
398         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399       }
400     }
401     isz[i] = isz_i;
402   }
403   PetscFunctionReturn(0);
404 }
405 
406 #undef __FUNCT__
407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now, where a single large chunk of
memory is used.

*/
427 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
428 {
429   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
430   Mat            A = c->A,B = c->B;
431   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
432   PetscErrorCode ierr;
433   PetscMPIInt    rank;
434   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
435   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
436   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
437   PetscInt       *rbuf_i,kmax,rbuf_0;
438   PetscBT        xtable;
439 
440   PetscFunctionBegin;
441   rank   = c->rank;
442   m      = C->rmap->N;
443   rstart = C->rmap->rstart;
444   cstart = C->cmap->rstart;
445   ai     = a->i;
446   aj     = a->j;
447   bi     = b->i;
448   bj     = b->j;
449   garray = c->garray;
450 
451 
452   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
453     rbuf_i  =  rbuf[i];
454     rbuf_0  =  rbuf_i[0];
455     ct     += rbuf_0;
456     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
457   }
458 
459   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
460   else      max1 = 1;
461   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
462   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
463   ++no_malloc;
464   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
465   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
466 
467   ct3 = 0;
468   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
469     rbuf_i =  rbuf[i];
470     rbuf_0 =  rbuf_i[0];
471     ct1    =  2*rbuf_0+1;
472     ct2    =  ct1;
473     ct3    += ct1;
474     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
475       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
476       oct2 = ct2;
477       kmax = rbuf_i[2*j];
478       for (k=0; k<kmax; k++,ct1++) {
479         row = rbuf_i[ct1];
480         if (!PetscBTLookupSet(xtable,row)) {
481           if (!(ct3 < mem_estimate)) {
482             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
483             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
484             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
485             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
486             xdata[0]     = tmp;
487             mem_estimate = new_estimate; ++no_malloc;
488             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
489           }
490           xdata[i][ct2++] = row;
491           ct3++;
492         }
493       }
494       for (k=oct2,max2=ct2; k<max2; k++) {
495         row   = xdata[i][k] - rstart;
496         start = ai[row];
497         end   = ai[row+1];
498         for (l=start; l<end; l++) {
499           val = aj[l] + cstart;
500           if (!PetscBTLookupSet(xtable,val)) {
501             if (!(ct3 < mem_estimate)) {
502               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
503               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
504               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
505               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
506               xdata[0]     = tmp;
507               mem_estimate = new_estimate; ++no_malloc;
508               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
509             }
510             xdata[i][ct2++] = val;
511             ct3++;
512           }
513         }
514         start = bi[row];
515         end   = bi[row+1];
516         for (l=start; l<end; l++) {
517           val = garray[bj[l]];
518           if (!PetscBTLookupSet(xtable,val)) {
519             if (!(ct3 < mem_estimate)) {
520               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
521               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
522               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
523               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
524               xdata[0]     = tmp;
525               mem_estimate = new_estimate; ++no_malloc;
526               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
527             }
528             xdata[i][ct2++] = val;
529             ct3++;
530           }
531         }
532       }
533       /* Update the header*/
534       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
535       xdata[i][2*j-1] = rbuf_i[2*j-1];
536     }
537     xdata[i][0] = rbuf_0;
538     xdata[i+1]  = xdata[i] + ct2;
539     isz1[i]     = ct2; /* size of each message */
540   }
541   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
542   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
543   PetscFunctionReturn(0);
544 }
545 /* -------------------------------------------------------------------------*/
546 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
547 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
548 /*
549     Every processor gets the entire matrix
550 */
551 #undef __FUNCT__
552 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gathers the entire parallel matrix onto every
   process as a sequential matrix *Bin[0], using three MPI_Allgatherv calls:
   row lengths, column indices, and (when flag == MAT_GET_VALUES) numerical
   values.  With MAT_REUSE_MATRIX only the values are re-gathered into the
   previously built structure.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* local row length = diagonal-block nnz + off-diagonal-block nnz */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* pack directly into this rank's slice of B's global j array */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge the (sorted) off-diagonal and diagonal columns of each row so the
       packed row is in increasing global column order */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered; reuse it */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    /* pack values directly into this rank's slice of B's a array, in the same
       merged order used for the column indices above */
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* on the reuse path recvcounts/displs were never allocated */
    if (!recvcounts) {
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
734 
735 
736 
737 #undef __FUNCT__
738 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
739 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
740 {
741   PetscErrorCode ierr;
742   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
743   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
744 
745   PetscFunctionBegin;
746   /*
747        Check for special case each processor gets entire matrix
748   */
749   if (ismax == 1 && C->rmap->N == C->cmap->N) {
750     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
751     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
752     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
753     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
754     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
755       wantallmatrix = PETSC_TRUE;
756       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
757     }
758   }
759   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
760   if (twantallmatrix) {
761     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
762     PetscFunctionReturn(0);
763   }
764 
765   /* Allocate memory to hold all the submatrices */
766   if (scall != MAT_REUSE_MATRIX) {
767     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
768   }
769   /* Determine the number of stages through which submatrices are done */
770   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
771   /*
772      Each stage will extract nmax submatrices.
773      nmax is determined by the matrix column dimension.
774      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
775   */
776   if (!nmax) nmax = 1;
777   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
778 
779   /* Make sure every processor loops through the nstages */
780   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
781 
782   for (i=0,pos=0; i<nstages; i++) {
783     if (pos+nmax <= ismax) max_no = nmax;
784     else if (pos == ismax) max_no = 0;
785     else                   max_no = ismax-pos;
786     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
787     pos += max_no;
788   }
789   PetscFunctionReturn(0);
790 }
791 
792 
793 
794 
795 /* -------------------------------------------------------------------------*/
796 #undef __FUNCT__
797 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts ismax sequential submatrices
   C(isrow[i],iscol[i]) from the parallel MPIAIJ matrix C into submats[].

   Rows owned by other processes are fetched with a four-phase non-blocking
   MPI exchange (tags tag0..tag3):
     tag0: row requests (header + requested global row numbers)
     tag1: per-row nonzero counts (so receivers can size buffers)
     tag2: column indices of the requested rows
     tag3: numerical values of the requested rows

   Collective on C; every process must call with the same number of stages
   (guaranteed by the caller, MatGetSubMatrices_MPIAIJ).

   Input:  C      - the parallel matrix
           ismax  - number of index-set pairs
           isrow[], iscol[] - row/column index sets (global numbering)
           scall  - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
   Output: submats - array of SeqAIJ submatrices (created here unless reusing)
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscBool      sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Check if the row/col indices are sorted (the check is currently disabled) */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the indices and sizes of every index set */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = #rows requested of proc p; w2[p] = 1 iff a message goes to p;
     w3[p] = #index sets with at least one row on p; w4 = per-IS scratch */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++; /* linear search for owner of row */
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp holds 2*msz ints: the first msz are the packed outgoing messages
     (sbuf1[p] points into it), the second msz serve later as rbuf2 storage */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;  /* advance past the previous message's space */
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];     /* one more IS in this message */
        sbuf1_j[2*k]   = ctr_j;            /* #rows requested for IS i */
        sbuf1_j[2*k-1] = i;                /* which IS those rows belong to */
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 reuses the second half of tmp; replies mirror the request layout */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* For each incoming request, compute per-row nonzero counts (diag + off-diag)
     and reply on tag1 so the requester can size its index/value buffers */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;  /* skip the header */
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;  /* convert to local row */
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each size reply arrives, allocate the matching index (rbuf3) and
     value (rbuf4) buffers and post Irecvs for the actual row data */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* emit global columns in ascending order: off-diag (< cstart),
             then diag block, then off-diag (>= cend) */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

          /* values are packed in the SAME order as the column indices above */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  /* cmap[i][gcol] = local column + 1 in submatrix i, or 0 if gcol not selected.
     NOTE(review): uses ismax*C->cmap->N integers -- dense in the global column
     dimension, which is memory-hungry for large N */
  {
    const PetscInt *icol_i;

    ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr    = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][r] will count the nonzeros of row r of submatrix i */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map*/
  /* rmap[i][grow] = local row in submatrix i (dense in C->rmap->N, like cmap) */
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data */
  /* Process tag2 replies: count selected columns of each remote row */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          for (k=0; k<ncols; k++) {
            if ((tcol = cmap_i[cols[k]])) {
              *mat_j++ = tcol - 1;   /* cmap stores local column + 1 */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* Process tag3 replies: insert the remote rows' (index,value) pairs */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1398 
1399 /*
 Observe that the reference counts of the Seq matrices used to construct this MPI
 matrix are NOT increased ("increfed"), so take care not to destroy them elsewhere.
1402  */
1403 #undef __FUNCT__
1404 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1405 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1406 {
1407   /* If making this function public, change the error returned in this function away from _PLIB. */
1408   PetscErrorCode ierr;
1409   Mat_MPIAIJ *aij;
1410   PetscBool seqaij;
1411 
1412   PetscFunctionBegin;
1413   /* Check to make sure the component matrices are compatible with C. */
1414   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1415   if(!seqaij) {
1416     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1417   }
1418   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1419   if(!seqaij) {
1420     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1421   }
1422   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1423     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1424   }
1425   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1426   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1427   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1428   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1429   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1430   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1431   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1432     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1433   }
1434   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1435   aij = (Mat_MPIAIJ*)((*C)->data);
1436   aij->A = A;
1437   aij->B = B;
1438   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1439   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1440   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1441   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1442   PetscFunctionReturn(0);
1443 }
1444 
1445 #undef __FUNCT__
1446 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1447 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1448 {
1449   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1450   PetscFunctionBegin;
1451   PetscValidPointer(A,2);
1452   PetscValidPointer(B,3);
1453   *A = aij->A;
1454   *B = aij->B;
1455   /* Note that we don't incref *A and *B, so be careful! */
1456   PetscFunctionReturn(0);
1457 }
1458 
1459 
1460 #undef __FUNCT__
1461 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1462 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1463 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1464 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1465 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1466 {
1467   PetscErrorCode ierr;
1468   PetscMPIInt size, flag;
1469   PetscInt    i,ii;
1470   PetscInt    ismax_c;
1471   PetscFunctionBegin;
1472   if(!ismax) {
1473     PetscFunctionReturn(0);
1474   }
1475   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1476     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1477     if(flag != MPI_IDENT) {
1478       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1479     }
1480     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1481     if(size > 1) {
1482       ++ismax_c;
1483     }
1484   }
1485   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1486     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1487   }
1488   else { /* if(ismax_c) */
1489     Mat *A,*B;
1490     IS  *isrow_c, *iscol_c;
1491     PetscMPIInt size;
1492     /*
1493      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1494      array of sequential matrices underlying the resulting parallel matrices.
1495      Which arrays to allocate is based on the value of MatReuse scall.
1496      There are as many diag matrices as there are original index sets.
1497      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1498 
1499      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1500      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1501      will deposite the extracted diag and off-diag parts.
1502      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1503      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1504     */
1505 
1506     /* Parallel matrix array is allocated only if no reuse is taking place. */
1507     if (scall != MAT_REUSE_MATRIX) {
1508       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1509     }
1510     else {
1511       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1512       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1513       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1514       for(i = 0, ii = 0; i < ismax; ++i) {
1515         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1516         if(size > 1) {
1517           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1518           ++ii;
1519         }
1520         else {
1521           A[i] = (*submat)[i];
1522         }
1523       }
1524     }
1525     /*
1526      Construct the complements of the iscol ISs for parallel ISs only.
1527      These are used to extract the off-diag portion of the resulting parallel matrix.
1528      The row IS for the off-diag portion is the same as for the diag portion,
1529      so we merely alias the row IS, while skipping those that are sequential.
1530     */
1531     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1532     for(i = 0, ii = 0; i < ismax; ++i) {
1533       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1534       if(size > 1) {
1535 	isrow_c[ii] = isrow[i];
1536 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1537 	++ii;
1538       }
1539     }
1540     /* Now obtain the sequential A and B submatrices separately. */
1541     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1542     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1543     for(ii = 0; ii < ismax_c; ++ii) {
1544       ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
1545     }
1546     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1547     /*
1548      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1549      have been extracted directly into the parallel matrices containing them, or
1550      simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or that
     A is put into the correct submat slot (if size == 1).
1553      */
1554     if(scall != MAT_REUSE_MATRIX) {
1555       for(i = 0, ii = 0; i < ismax; ++i) {
1556         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1557         if(size > 1) {
1558           /*
1559            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1560            */
1561           /* Construct submat[i] from the Seq pieces A and B. */
1562           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1563 
1564           ++ii;
1565         }
1566         else {
1567           (*submat)[i] = A[i];
1568         }
1569       }
1570     }
1571     ierr = PetscFree(A); CHKERRQ(ierr);
1572     ierr = PetscFree(B); CHKERRQ(ierr);
1573   }
1574   PetscFunctionReturn(0);
1575 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1576 
1577 
1578 
1579 #undef __FUNCT__
1580 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1581 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1582 {
1583   PetscErrorCode ierr;
1584   PetscFunctionBegin;
1585   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1586                                            MatCreateMPIAIJFromSeqMatrices_Private,
1587                                            MatMPIAIJExtractSeqMatrices_Private);
1588   CHKERRQ(ierr);
1589   PetscFunctionReturn(0);
1590 }
1591