xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 5c8f6a953e7ed1c81f507d64aebddb11080b60e9)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
 mesg [n]  data(is[1])
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or have been, processed)
52 */
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - one round of overlap growth for all imax
   index sets at once.

   Outline (all phases visible below):
     1. Count, per destination rank, how many requested rows it owns (w1..w4).
     2. Exchange message counts/lengths (PetscGatherNumberOfMessages/Lengths)
        and post the phase-1 receives.
     3. Pack one message per destination: header = [#IS, (is-id,count) pairs]
        followed by the row indices; locally-owned rows go straight into the
        per-IS bit table instead.
     4. Overlap local adjacency work (MatIncreaseOverlap_MPIAIJ_Local) with
        the communication; process incoming requests
        (MatIncreaseOverlap_MPIAIJ_Receive) and send the expanded rows back.
     5. Merge the replies into the per-IS tables and rebuild each is[i].

   Note: the input index sets are destroyed and replaced by new ones on
   PETSC_COMM_SELF containing the enlarged row sets.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  /* two distinct tags: tag1 for the requests, tag2 for the replies */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p]: total indices destined for rank p; w2[p]: 1 iff any message goes to p;
     w3[p]: number of distinct ISs contributing to p; w4: per-IS scratch */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and their sources */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single tmp buffer into one contiguous slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* data region starts after the header */
  }

  /* Memory for doing local proc's work*/
  /* table[i] is a bit table of global rows seen by IS i; data[i] the
     corresponding row list; isz[i] its current length */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input ISs are consumed; replacements are created at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work - overlaps with the communication in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply to the
     i-th requester (xdata[0] owns the single underlying allocation) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and their sources */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* handle replies in whatever order they arrive */
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;  /* skip the header; ct1 walks the data region */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild each IS from the merged row lists */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
395 /*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
397          and return the output
398 
399          Input:
400            C    - the matrix
401            nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests
403 
404          Output:
405            xdata - array of messages to be sent back
406            isz1  - size of each message
407 
  For better efficiency perhaps we should malloc each xdata[i] separately;
then if a realloc is required we would need only copy the data for that one
message rather than all previous messages, as happens now where a single large
chunk of memory is used.
412 
413 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of IS blocks over all requests; total_sz = total
     number of requested rows - both feed the initial size estimate */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* heuristic estimate: average nonzeros per local row times blocks requested;
     the buffer is grown below whenever the estimate proves too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else      max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* xdata[0] owns the single allocation; xdata[i>0] point inside it */
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts total entries written into xdata, used as the overflow check */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* ct1 reads rows from the request's data region */
    ct2    =  ct1;          /* ct2 writes rows into the reply */
    ct3    += ct1;          /* reserve space for the reply's header */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); /* dedup per (proc,IS) pair */
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* out of space: grow by 1.5x, copy, and rebase xdata[1..i]
               into the new allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each accepted row by one level of adjacency */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal-block columns, shifted to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal-block columns, via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
530 /* -------------------------------------------------------------------------*/
531 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
532 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
533 /*
534     Every processor gets the entire matrix
535 */
536 #undef __FUNCT__
537 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - gives every process its own sequential copy of
   the entire parallel matrix, assembled via MPI_Allgatherv of row lengths,
   column indices, and (optionally) numerical values.

   Input:
     A     - the parallel MPIAIJ matrix
     flag  - MAT_GET_VALUES to also gather the numerical values,
             otherwise only the structure is built/reused
     scall - MAT_INITIAL_MATRIX to create *Bin, MAT_REUSE_MATRIX to refill it

   Output:
     Bin   - pointer to a malloc'ed Mat holding the full matrix on
             PETSC_COMM_SELF (caller owns the malloc'ed pointer)
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* write directly into B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge each row from bd (below-diagonal part), ad, bd (above-diagonal
       part) so the global column indices come out in ascending order */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];  /* write directly into B's value array */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the structural pass above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
720 
721 
722 
723 #undef __FUNCT__
724 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
725 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
726 {
727   PetscErrorCode ierr;
728   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
729   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
730 
731   PetscFunctionBegin;
732   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
733   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
734      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
735      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
736    */
737   for (i = 0; i < ismax; ++i) {
738     PetscBool sorted;
739     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
740     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
741   }
742 
743   /*
744        Check for special case: each processor gets entire matrix
745   */
746   if (ismax == 1 && C->rmap->N == C->cmap->N) {
747     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
748     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
749     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
751     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
752       wantallmatrix = PETSC_TRUE;
753       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
754     }
755   }
756   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
757   if (twantallmatrix) {
758     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
759     PetscFunctionReturn(0);
760   }
761 
762   /* Allocate memory to hold all the submatrices */
763   if (scall != MAT_REUSE_MATRIX) {
764     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
765   }
766 
767   /* Check for special case: each processor gets entire matrix columns */
768   ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
769   for (i=0; i<ismax; i++) {
770     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
771     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
772     if (colflag && ncol == C->cmap->N) {
773       allcolumns[i] = PETSC_TRUE;
774     } else {
775       allcolumns[i] = PETSC_FALSE;
776     }
777   }
778 
779   /* Determine the number of stages through which submatrices are done */
780   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
781 
782   /*
783      Each stage will extract nmax submatrices.
784      nmax is determined by the matrix column dimension.
785      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
786   */
787   if (!nmax) nmax = 1;
788   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
789 
790   /* Make sure every processor loops through the nstages */
791   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
792 
793   for (i=0,pos=0; i<nstages; i++) {
794     if (pos+nmax <= ismax) max_no = nmax;
795     else if (pos == ismax) max_no = 0;
796     else                   max_no = ismax-pos;
797     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
798     pos += max_no;
799   }
800 
801   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
802   PetscFunctionReturn(0);
803 }
804 
805 /* -------------------------------------------------------------------------*/
806 #undef __FUNCT__
807 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts up to ismax sequential submatrices
   submats[i] = C(isrow[i],iscol[i]) from the parallel AIJ matrix C.

   Rows owned by other processes are fetched with a four-phase nonblocking exchange,
   each phase on its own tag:
     tag0: each process sends the owners the global indices of the rows it needs
           (message = header of (is_no,count) pairs followed by the row indices);
     tag1: the owners reply with the nonzero count of every requested row;
     tag2: the owners send the (global) column indices of the requested rows;
     tag3: the owners send the matching numerical values.

   allcolumns[i] is set by the caller when iscol[i] is the identity on all columns of C;
   in that case icol[i] is PETSC_NULL, no column map is built for submatrix i, and global
   column indices are stored in it unmapped.

   With scall == MAT_REUSE_MATRIX the existing submats[] are refilled in place; their
   sizes and row lengths must match the freshly computed ones (checked below).
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the index arrays and local sizes of each IS; all-column submatrices
     use the full column space of C and no explicit column indices */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = PETSC_NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* locate the owner of global row irow_i[j] by linear scan of the ownership ranges */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      /* w1[j] = total rows requested from proc j; w3[j] = number of ISs that touch proc j */
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* Carve the single 'tmp' allocation into per-destination send buffers sbuf1[j];
     the second half of tmp (starting at tmp+msz) is reused below for rbuf2 */
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 aliases the tail of 'tmp' (no separate allocation for the data) */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* As each tag0 request arrives, compute the nonzero count of every requested row
     (diagonal block A plus off-diagonal block B) and reply on tag1 */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  /* As each tag1 size reply arrives, allocate the index/value receive buffers
     (rbuf2[idex][0] holds the total nonzero count) and post the tag2/tag3 Irecvs */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* global column indices, emitted in ascending order:
             off-diagonal entries left of the diagonal block, the diagonal block,
             then off-diagonal entries right of it (bmap maps local->global) */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

          /* values emitted in the same order as the column indices above */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* cmap maps global column -> (local column + 1); 0 means "column not in iscol[i]".
     The +1 offset lets 0 act as the not-found sentinel in both table and array form. */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] accumulates the nonzero count of local row j of submatrix i,
     first from locally owned rows, then from the off-process replies */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) { lens_i[j]++;}
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined (PETSC_USE_CTABLE)
  /* table form stores (global row + 1) -> (local row + 1); array form stores local row directly */
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* Walk each tag2 reply in completion order; sbuf1 (the request we sent) tells which
     submatrix and how many rows, rbuf2 gives the per-row counts, rbuf3 the column indices */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns[is_no]) {cmap_i  = cmap[is_no];}
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                lens_i[row]++;
              }
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* Entries are written straight into the Mat_SeqAIJ arrays (i/j/a), relying on the
     exact preallocation computed in lens[]; ilen tracks how far each row is filled */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k] ; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* Same direct-insertion scheme, driven by the tag3 value replies as they complete */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]) {cmap_i = cmap[is_no];}
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1528 
/*
 Observe that the Seq matrices used to construct this MPI matrix are not increfed
 (their reference counts are not incremented). Be careful not to destroy them
 elsewhere while the MPI matrix is still in use, or it will be left holding
 dangling component matrices.
 */
1533 #undef __FUNCT__
1534 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1535 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1536 {
1537   /* If making this function public, change the error returned in this function away from _PLIB. */
1538   PetscErrorCode ierr;
1539   Mat_MPIAIJ     *aij;
1540   PetscBool      seqaij;
1541 
1542   PetscFunctionBegin;
1543   /* Check to make sure the component matrices are compatible with C. */
1544   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1545   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1546   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1547   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1548   if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1549 
1550   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1551   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1552   ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
1553   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1554   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1555   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1556   ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1557   aij = (Mat_MPIAIJ*)((*C)->data);
1558   aij->A = A;
1559   aij->B = B;
1560   ierr = PetscLogObjectParent(*C,A);CHKERRQ(ierr);
1561   ierr = PetscLogObjectParent(*C,B);CHKERRQ(ierr);
1562   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1563   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 #undef __FUNCT__
1568 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1569 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1570 {
1571   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1572 
1573   PetscFunctionBegin;
1574   PetscValidPointer(A,2);
1575   PetscValidPointer(B,3);
1576   *A = aij->A;
1577   *B = aij->B;
1578   /* Note that we don't incref *A and *B, so be careful! */
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 #undef __FUNCT__
1583 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1584 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1585                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1586                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1587                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1588 {
1589   PetscErrorCode ierr;
1590   PetscMPIInt    size, flag;
1591   PetscInt       i,ii;
1592   PetscInt       ismax_c;
1593 
1594   PetscFunctionBegin;
1595   if (!ismax) {
1596     PetscFunctionReturn(0);
1597   }
1598   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1599     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
1600     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1601     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1602     if (size > 1) {
1603       ++ismax_c;
1604     }
1605   }
1606   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1607     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1608   } else { /* if (ismax_c) */
1609     Mat         *A,*B;
1610     IS          *isrow_c, *iscol_c;
1611     PetscMPIInt size;
1612     /*
1613      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1614      array of sequential matrices underlying the resulting parallel matrices.
1615      Which arrays to allocate is based on the value of MatReuse scall.
1616      There are as many diag matrices as there are original index sets.
1617      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1618 
1619      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1620      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1621      will deposite the extracted diag and off-diag parts.
1622      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1623      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1624     */
1625 
1626     /* Parallel matrix array is allocated only if no reuse is taking place. */
1627     if (scall != MAT_REUSE_MATRIX) {
1628       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1629     } else {
1630       ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr);
1631       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr);
1632       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1633       for (i = 0, ii = 0; i < ismax; ++i) {
1634         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1635         if (size > 1) {
1636           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
1637           ++ii;
1638         } else {
1639           A[i] = (*submat)[i];
1640         }
1641       }
1642     }
1643     /*
1644      Construct the complements of the iscol ISs for parallel ISs only.
1645      These are used to extract the off-diag portion of the resulting parallel matrix.
1646      The row IS for the off-diag portion is the same as for the diag portion,
1647      so we merely alias the row IS, while skipping those that are sequential.
1648     */
1649     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr);
1650     for (i = 0, ii = 0; i < ismax; ++i) {
1651       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1652       if (size > 1) {
1653         isrow_c[ii] = isrow[i];
1654         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
1655         ++ii;
1656       }
1657     }
1658     /* Now obtain the sequential A and B submatrices separately. */
1659     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
1660     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
1661     for (ii = 0; ii < ismax_c; ++ii) {
1662       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
1663     }
1664     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
1665     /*
1666      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1667      have been extracted directly into the parallel matrices containing them, or
1668      simply into the sequential matrix identical with the corresponding A (if size == 1).
1669      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1670      A is put into the correct submat slot (if size == 1).
1671      */
1672     if (scall != MAT_REUSE_MATRIX) {
1673       for (i = 0, ii = 0; i < ismax; ++i) {
1674         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1675         if (size > 1) {
1676           /*
1677            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1678            */
1679           /* Construct submat[i] from the Seq pieces A and B. */
1680           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
1681 
1682           ++ii;
1683         } else {
1684           (*submat)[i] = A[i];
1685         }
1686       }
1687     }
1688     ierr = PetscFree(A);CHKERRQ(ierr);
1689     ierr = PetscFree(B);CHKERRQ(ierr);
1690   }
1691   PetscFunctionReturn(0);
1692 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1693 
1694 
1695 
1696 #undef __FUNCT__
1697 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1698 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1699 {
1700   PetscErrorCode ierr;
1701 
1702   PetscFunctionBegin;
1703   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
1704   PetscFunctionReturn(0);
1705 }
1706