xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 5ef26d82791c105202c99bf0171a606e8eac9ba6)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43 mesg [n]  data(is[1])
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
51 nrqr - no of requests received (which have to be, or which have been, processed)
52 */
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - one round of overlap increase: each index
   set in is[] is replaced (the old IS objects are destroyed) by a new IS that
   additionally contains every column reachable from its rows, including rows
   owned by other processes (obtained via a two-phase request/reply exchange).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  /* two distinct tags: tag1 for the request phase, tag2 for the reply phase */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total #indices destined for proc p, w2[p] = 1 iff a message goes
     to p, w3[p] = #index sets contributing to p, w4 = per-IS scratch counts. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data
     (see the message-format comment above this function) */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages: tmp is one contiguous chunk of
     size msz that outdat[] carves up per destination */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve tmp into per-destination slices of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: slot 0 = #IS blocks (filled in below), then 2*w3[j]
     header slots; ptr[j] tracks where the next data index goes */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: per-IS bit table (t_p) for duplicate
     detection and per-IS result array (d_p), each worst-case size M */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table; LookupSet returns previous bit, so
                    each row enters data_i at most once */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS: record (IS number, count) */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input index sets are consumed; new ones are created at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply for the
     i-th request (all replies share one chunk rooted at xdata[0]) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      /* sanity: requests must have arrived from the processes announced by
         PetscGatherMessageLengths, in the same order */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge it into the local tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the request messages */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged output index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
395 /*
396      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
397          and return the output
398 
399          Input:
400            C    - the matrix
401            nrqr - no of messages being processed.
402            rbuf - an array of pointers to the received requests
403 
404          Output:
405            xdata - array of messages to be sent back
406            isz1  - size of each message
407 
408   For better efficiency perhaps we should malloc separately each xdata[i],
409 then if a realloc is required we need only copy the data for that one row
410 rather than all previous rows, as it is now where a single large chunk of
411 memory is used.
412 
413 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* count header entries (ct) and data entries (total_sz) over all requests */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* initial guess for the reply size: average nonzeros per local row times the
     number of requested IS blocks, or at least the raw request size */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else      max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* all replies live in ONE chunk rooted at xdata[0]; xdata[i>0] alias into it */
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct1 = read cursor in rbuf_i, ct2 = write cursor in xdata[i],
     ct3 = total entries written into the shared chunk (grow trigger) */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* fresh duplicate-detection table per IS block */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared chunk by 1.5x and re-base every xdata[ctr]
               pointer relative to the new xdata[0] */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand every row just copied in (positions oct2..ct2) by one layer */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: map via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: count for this IS block and the originating IS no */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  /* NOTE(review): %D values here are PetscInt counts, not bytes — message
     wording predates this; verify before relying on it */
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
530 /* -------------------------------------------------------------------------*/
531 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
532 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
533 /*
534     Every processor gets the entire matrix
535 */
536 #undef __FUNCT__
537 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - gathers the ENTIRE parallel matrix onto every
   process as a sequential AIJ matrix (*Bin)[0].  With MAT_INITIAL_MATRIX the
   structure (row lengths, column indices) is built and gathered; numerical
   values are gathered only when flag == MAT_GET_VALUES, so MAT_REUSE_MATRIX
   can cheaply refresh values into an existing B.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
         (diagonal + off-diagonal parts of each locally owned row)
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    /* caller receives a malloc'ed one-element Mat array and owns it */
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, written directly into B's
       j array at the slot for this rank's rows; each row is emitted in
       global column order: below-diagonal off-diag, diagonal block, above-
       diagonal off-diag (garray translates compressed off-diag columns)
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered, reuse the existing B */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location,
       in the same interleaved (lower/diag/upper) order as the indices above
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
       (recvcounts/displs were freed only in the INITIAL path above, so they
       may need to be allocated here on the REUSE path)
    */
    if (!recvcounts) {
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
720 
721 
722 
723 #undef __FUNCT__
724 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - extracts ismax sequential submatrices of the
   parallel matrix C, one per (isrow[i], iscol[i]) pair.  Special-cases the
   "every process wants the whole matrix" pattern, then processes the general
   case in stages to bound peak memory use.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
  /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
     However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
     take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
   */
  for(i = 0; i < ismax; ++i) {
    PetscBool sorted;
    ierr = ISSorted(iscol[i], &sorted); CHKERRQ(ierr);
    if(!sorted) SETERRQ1(((PetscObject)iscol[i])->comm, PETSC_ERR_SUP, "Column index set %D not sorted", i);
  }

  /*
       Check for special case: each processor gets entire matrix
       (single identity row and column IS covering the full square matrix)
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      /* user can opt out of the fast path with -use_fast_submatrix false */
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* MPI_MIN over all ranks: the fast path is taken only if EVERY rank wants it.
     NOTE(review): reduces a PetscBool with MPI_INT — appears to rely on
     PetscBool being int-sized; confirm against PetscBool's definition */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N){
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done,
     sized so one stage's workspace stays around 20 MB */
  nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages: _Local is
     collective, so all ranks must make the same number of calls even with
     max_no == 0 */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
805 
806 /* -------------------------------------------------------------------------*/
807 #undef __FUNCT__
808 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts up to ismax sequential submatrices of the
   parallel MPIAIJ matrix C, one per (isrow[i], iscol[i]) pair, into submats[].

   The algorithm proceeds in four communication phases, all on C's communicator:
     1. Each process tells the owners which global rows it needs (sbuf1/rbuf1, tag0).
     2. Owners reply with the number of nonzeros in each requested row (sbuf2/rbuf2, tag1).
     3. Owners send the column indices of the requested rows (sbuf_aj/rbuf3, tag2).
     4. Owners send the numerical values of the requested rows (sbuf_aa/rbuf4, tag3).
   Locally-owned rows are copied directly without communication.  Row and column maps
   (rmap/cmap: global index of C -> local index of the submatrix) are built either as
   PetscTables (PETSC_USE_CTABLE, 1-based entries) or as dense integer arrays.

   allcolumns[i] == PETSC_TRUE means iscol[i] spans all columns of C; in that case the
   column map is skipped and global column indices are stored directly in the submatrix
   (see the "allcolumns" branches below).

   Collective on C.  With MAT_REUSE_MATRIX, the existing submats[] must have exactly the
   same nonzero structure as a fresh extraction would produce.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;

  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Fetch the raw indices/lengths of every row and column IS; for an "all columns"
     IS no index array is needed, only the global column count. */
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]){
      icol[i] = PETSC_NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     Work vectors:
       w4[p] - rows of the current IS owned by process p (scratch, reset per IS)
       w1[p] - total rows requested from process p over all ISs (later: full message length)
       w2[p] - 1 if any message goes to process p, else 0
       w3[p] - number of ISs that need at least one row from process p */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      /* linear scan of the ownership ranges to find the owner of this global row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array): ranks we send a request to */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data; grow w1 to the full length */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages.
     tmp is sized 2*msz: the first msz entries back the sbuf1[] request messages,
     the remaining msz entries are reused below as the rbuf2[] reply buffers. */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve tmp into one contiguous slice of length w1[j] per destination rank */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space: sbuf1[j][0] = #ISs, then (is_no,count) pairs,
     followed by the requested global row numbers (written via ptr[j]) */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size; rbuf2[] reuses the second half of tmp */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages: for each incoming request compute, per requested row, the
     number of nonzeros (diagonal + off-diagonal part) and reply with those counts */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header: total nonzeros for this requester, then echo the request header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages: once a size reply arrives, allocate the index (rbuf3) and
     value (rbuf4) buffers and post the receives for phases 3 and 4 */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off (one contiguous allocation,
     sliced per requester by req_size[]) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols: off-diagonal entries
             left of the diagonal block first, then the diagonal block, then the
             off-diagonal entries to the right -- i.e. ascending global order */
          cols  = sbuf_aj_i + ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off (values in the same order as
     the column indices sent above) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices
     (PetscTable variant stores 1-based entries; dense variant stores j+1 so 0 means "absent") */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate...: lens[i][j] will hold the number
     of nonzeros in local row j of submatrix i (used for preallocation) */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]){
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) { lens_i[j]++;}
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data: walk each column-index reply in the same order
     the request message was framed */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns[is_no]){cmap_i  = cmap[is_no];}
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]){
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                lens_i[row]++;
              }
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs); CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs); CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs); CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows (writing directly into the SeqAIJ i/j/a arrays) */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]){
            for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol){
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k] ; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows from the phase-3/4 replies */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]){cmap_i = cmap[is_no];}
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          if (!allcolumns[is_no]){
            for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]){
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]){
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1526 
1527 /*
1528  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1529  Be careful not to destroy them elsewhere.
1530  */
1531 #undef __FUNCT__
1532 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1533 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1534 {
1535   /* If making this function public, change the error returned in this function away from _PLIB. */
1536   PetscErrorCode ierr;
1537   Mat_MPIAIJ     *aij;
1538   PetscBool      seqaij;
1539 
1540   PetscFunctionBegin;
1541   /* Check to make sure the component matrices are compatible with C. */
1542   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1543   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1544   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1545   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1546   if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1547 
1548   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1549   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1550   ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs); CHKERRQ(ierr);
1551   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1552   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1553   if((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1554   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1555   aij = (Mat_MPIAIJ*)((*C)->data);
1556   aij->A = A;
1557   aij->B = B;
1558   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1559   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1560   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1561   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1567 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1568 {
1569   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1570   PetscFunctionBegin;
1571   PetscValidPointer(A,2);
1572   PetscValidPointer(B,3);
1573   *A = aij->A;
1574   *B = aij->B;
1575   /* Note that we don't incref *A and *B, so be careful! */
1576   PetscFunctionReturn(0);
1577 }
1578 
1579 #undef __FUNCT__
1580 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1581 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1582 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1583 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1584 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1585 {
1586   PetscErrorCode ierr;
1587   PetscMPIInt    size, flag;
1588   PetscInt       i,ii;
1589   PetscInt       ismax_c;
1590 
1591   PetscFunctionBegin;
1592   if (!ismax) {
1593     PetscFunctionReturn(0);
1594   }
1595   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1596     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1597     if(flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1598     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1599     if(size > 1) {
1600       ++ismax_c;
1601     }
1602   }
1603   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1604     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1605   } else { /* if(ismax_c) */
1606     Mat         *A,*B;
1607     IS          *isrow_c, *iscol_c;
1608     PetscMPIInt size;
1609     /*
1610      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1611      array of sequential matrices underlying the resulting parallel matrices.
1612      Which arrays to allocate is based on the value of MatReuse scall.
1613      There are as many diag matrices as there are original index sets.
1614      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1615 
1616      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1617      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1618      will deposite the extracted diag and off-diag parts.
1619      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1620      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1621     */
1622 
1623     /* Parallel matrix array is allocated only if no reuse is taking place. */
1624     if (scall != MAT_REUSE_MATRIX) {
1625       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1626     } else {
1627       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1628       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1629       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1630       for(i = 0, ii = 0; i < ismax; ++i) {
1631         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1632         if(size > 1) {
1633           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1634           ++ii;
1635         } else {
1636           A[i] = (*submat)[i];
1637         }
1638       }
1639     }
1640     /*
1641      Construct the complements of the iscol ISs for parallel ISs only.
1642      These are used to extract the off-diag portion of the resulting parallel matrix.
1643      The row IS for the off-diag portion is the same as for the diag portion,
1644      so we merely alias the row IS, while skipping those that are sequential.
1645     */
1646     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1647     for(i = 0, ii = 0; i < ismax; ++i) {
1648       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1649       if(size > 1) {
1650 	isrow_c[ii] = isrow[i];
1651 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1652 	++ii;
1653       }
1654     }
1655     /* Now obtain the sequential A and B submatrices separately. */
1656     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1657     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1658     for(ii = 0; ii < ismax_c; ++ii) {
1659       ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
1660     }
1661     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1662     /*
1663      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1664      have been extracted directly into the parallel matrices containing them, or
1665      simply into the sequential matrix identical with the corresponding A (if size == 1).
1666      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1667      A is put into the correct submat slot (if size == 1).
1668      */
1669     if(scall != MAT_REUSE_MATRIX) {
1670       for(i = 0, ii = 0; i < ismax; ++i) {
1671         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1672         if(size > 1) {
1673           /*
1674            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1675            */
1676           /* Construct submat[i] from the Seq pieces A and B. */
1677           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1678 
1679           ++ii;
1680         } else {
1681           (*submat)[i] = A[i];
1682         }
1683       }
1684     }
1685     ierr = PetscFree(A); CHKERRQ(ierr);
1686     ierr = PetscFree(B); CHKERRQ(ierr);
1687   }
1688   PetscFunctionReturn(0);
1689 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1690 
1691 
1692 
1693 #undef __FUNCT__
1694 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1695 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1696 {
1697   PetscErrorCode ierr;
1698 
1699   PetscFunctionBegin;
1700   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
1701   PetscFunctionReturn(0);
1702 }
1703