xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 8cbdbec6f8317ddf7886f91eb9c6bd083b543c50)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (number of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Increases the overlap of the imax index
   sets is[] by exactly one level.

   For each index in each IS, the owning process is found from C's row layout.
   Locally owned indices are recorded in a per-IS bit table (table[i]) and
   result array (data[i]); remotely owned indices are batched into one message
   per destination (format documented above), sent with tag1, serviced by the
   receivers (MatIncreaseOverlap_MPIAIJ_Receive), and the answers are returned
   with tag2 and merged into the local tables.  Meanwhile the locally owned
   rows are expanded by MatIncreaseOverlap_MPIAIJ_Local.  The input index
   sets are destroyed and recreated in place with the enlarged contents.

   Collective on the communicator of C.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  /* two fresh tags: phase 1 carries the requests, phase 2 the replies */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  /* idx[i]/n[i]: indices and local length of each input IS */
  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p]: total index count destined for proc p; w2[p]: 1 iff any mesg to p;
     w3[p]: number of ISs that contribute data for p; w4: per-IS scratch counts */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* number of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single chunk tmp into one outgoing buffer per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers; ptr[j] tracks the next free data slot in outdat[j] */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    /* table[i]: bit-array over all M global rows marking membership in IS i;
       data[i]: accumulates the (unsorted) result indices for IS i; both are
       slices of the single allocations t_p and d_p */
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* destroy the inputs; they are recreated at the end with the overlapped contents */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Service the received requests: xdata[i]/isz1[i] hold the reply to message i */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: merge each reply entry into the matching local table */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Recreate the input index sets with the accumulated (overlapped) indices */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
395 /*
396       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
397          and return the output
398 
399          Input:
400            C    - the matrix
401            nrqr - no of messages being processed.
402            rbuf - an array of pointers to the recieved requests
403 
404          Output:
405            xdata - array of messages to be sent back
406            isz1  - size of each message
407 
408   For better efficiency perhaps we should malloc separately each xdata[i],
409 then if a remalloc is required we need only copy the data for that one row
410 rather then all previous rows as it is now where a single large chunck of
411 memory is used.
412 
413 */
414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
415 {
416   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
417   Mat            A = c->A,B = c->B;
418   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
419   PetscErrorCode ierr;
420   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
421   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
422   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
423   PetscInt       *rbuf_i,kmax,rbuf_0;
424   PetscBT        xtable;
425 
426   PetscFunctionBegin;
427   m      = C->rmap->N;
428   rstart = C->rmap->rstart;
429   cstart = C->cmap->rstart;
430   ai     = a->i;
431   aj     = a->j;
432   bi     = b->i;
433   bj     = b->j;
434   garray = c->garray;
435 
436 
437   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
438     rbuf_i  =  rbuf[i];
439     rbuf_0  =  rbuf_i[0];
440     ct     += rbuf_0;
441     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
442   }
443 
444   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
445   else      max1 = 1;
446   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
447   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
448   ++no_malloc;
449   ierr         = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
450   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
451 
452   ct3 = 0;
453   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
454     rbuf_i =  rbuf[i];
455     rbuf_0 =  rbuf_i[0];
456     ct1    =  2*rbuf_0+1;
457     ct2    =  ct1;
458     ct3    += ct1;
459     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
460       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
461       oct2 = ct2;
462       kmax = rbuf_i[2*j];
463       for (k=0; k<kmax; k++,ct1++) {
464         row = rbuf_i[ct1];
465         if (!PetscBTLookupSet(xtable,row)) {
466           if (!(ct3 < mem_estimate)) {
467             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
468             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
469             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
470             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
471             xdata[0]     = tmp;
472             mem_estimate = new_estimate; ++no_malloc;
473             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
474           }
475           xdata[i][ct2++] = row;
476           ct3++;
477         }
478       }
479       for (k=oct2,max2=ct2; k<max2; k++) {
480         row   = xdata[i][k] - rstart;
481         start = ai[row];
482         end   = ai[row+1];
483         for (l=start; l<end; l++) {
484           val = aj[l] + cstart;
485           if (!PetscBTLookupSet(xtable,val)) {
486             if (!(ct3 < mem_estimate)) {
487               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
488               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
489               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
490               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
491               xdata[0]     = tmp;
492               mem_estimate = new_estimate; ++no_malloc;
493               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
494             }
495             xdata[i][ct2++] = val;
496             ct3++;
497           }
498         }
499         start = bi[row];
500         end   = bi[row+1];
501         for (l=start; l<end; l++) {
502           val = garray[bj[l]];
503           if (!PetscBTLookupSet(xtable,val)) {
504             if (!(ct3 < mem_estimate)) {
505               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
506               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
507               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
508               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
509               xdata[0]     = tmp;
510               mem_estimate = new_estimate; ++no_malloc;
511               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
512             }
513             xdata[i][ct2++] = val;
514             ct3++;
515           }
516         }
517       }
518       /* Update the header*/
519       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
520       xdata[i][2*j-1] = rbuf_i[2*j-1];
521     }
522     xdata[i][0] = rbuf_0;
523     xdata[i+1]  = xdata[i] + ct2;
524     isz1[i]     = ct2; /* size of each message */
525   }
526   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
527   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
528   PetscFunctionReturn(0);
529 }
530 /* -------------------------------------------------------------------------*/
531 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
532 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
533 /*
534     Every processor gets the entire matrix
535 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Every processor gets a full sequential copy
   of the parallel matrix A, returned in (*Bin)[0].

   flag  - MAT_GET_VALUES gathers the numerical values as well; otherwise
           only the nonzero structure is assembled
   scall - MAT_INITIAL_MATRIX creates the sequential matrix; otherwise the
           previously created **Bin is reused

   Row lengths, column indices, and (optionally) values are each gathered
   with MPI_Allgatherv over A's communicator; the local send buffers alias
   this rank's slice of the destination arrays so the gathers can be done
   in place where MPI_IN_PLACE is available.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* row length = nonzeros in the diagonal block plus the off-diagonal block */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
       (jsendbuf aliases this rank's slice of b->j, so the gather below
       can be done in place; columns are written in global order: lower
       off-diagonal, diagonal block, upper off-diagonal)
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        /* NOTE(review): compares global columns against row-map offsets —
           appears to assume matching row/column layouts (square matrix path);
           confirm against callers */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
       (same lower/diagonal/upper ordering as the column indices above)
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts/displs were not built above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the sequential copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
720 
721 
722 
723 #undef __FUNCT__
724 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
725 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
726 {
727   PetscErrorCode ierr;
728   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
729   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
730 
731   PetscFunctionBegin;
732 
733   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
734   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
735      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
736      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
737    */
738   for (i = 0; i < ismax; ++i) {
739     PetscBool sorted;
740     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
741     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
742   }
743 
744   /*
745        Check for special case: each processor gets entire matrix
746   */
747   if (ismax == 1 && C->rmap->N == C->cmap->N) {
748     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
749     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
750     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
751     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
752     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
753       wantallmatrix = PETSC_TRUE;
754       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
755     }
756   }
757   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
758   if (twantallmatrix) {
759     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
760     PetscFunctionReturn(0);
761   }
762 
763   /* Allocate memory to hold all the submatrices */
764   if (scall != MAT_REUSE_MATRIX) {
765     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
766   }
767 
768   /* Check for special case: each processor gets entire matrix columns */
769   ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
770   for (i=0; i<ismax; i++) {
771     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
772     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
773     if (colflag && ncol == C->cmap->N){
774       allcolumns[i] = PETSC_TRUE;
775     } else {
776       allcolumns[i] = PETSC_FALSE;
777     }
778   }
779 
780   /* Determine the number of stages through which submatrices are done */
781   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
782 
783   /*
784      Each stage will extract nmax submatrices.
785      nmax is determined by the matrix column dimension.
786      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
787   */
788   if (!nmax) nmax = 1;
789   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
790 
791   /* Make sure every processor loops through the nstages */
792   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
793 
794   for (i=0,pos=0; i<nstages; i++) {
795     if (pos+nmax <= ismax) max_no = nmax;
796     else if (pos == ismax) max_no = 0;
797     else                   max_no = ismax-pos;
798     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
799     pos += max_no;
800   }
801 
802   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
803   PetscFunctionReturn(0);
804 }
805 
806 /* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - extracts ismax sequential submatrices of the
   parallel AIJ matrix C, one per (isrow[i],iscol[i]) pair, into submats[].

   The routine runs a four-phase non-blocking exchange:
     phase 1 (tag0): each process sends the list of off-process rows it needs;
     phase 2 (tag1): row owners reply with the nonzero count of every requested row;
     phase 3 (tag2): row owners send the (global) column indices of those rows;
     phase 4 (tag3): row owners send the matching numerical values.
   Local rows are assembled directly via MatGetRow_MPIAIJ, with no communication.

   allcolumns[i] == PETSC_TRUE means iscol[i] is the identity over all columns of C;
   in that case no column map is built and global column indices are stored verbatim
   in the submatrix (see the "allcolumns" branches below).

   scall == MAT_REUSE_MATRIX reuses the matrices already in submats[] and requires
   identical row lengths (checked below); otherwise new SeqAIJ matrices are created.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;

  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the indices/sizes of every index set; for allcolumns ISs no column
     indices are needed (every column is taken) */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]){
      icol[i] = PETSC_NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total number of rows requested from proc p (over all ISs),
     w2[p] = 1 iff any message goes to p, w3[p] = number of ISs that touch p,
     w4    = per-IS scratch version of w1 */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear scan of the ownership ranges to find the owner of global row */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] lists the ranks actually communicated with, so later loops only
     visit those ranks rather than all of comm */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is sized 2*msz: the first msz entries back the send buffers sbuf1[],
     the second msz entries are later reused as the rbuf2 receive area */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve per-destination send buffers out of tmp, each of length w1[j] */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* message layout (see the sample-format comment above this file's _Once()):
     sbuf1[j][0] = number of ISs, then (is-index,count) pairs, then row lists;
     ptr[j] tracks the current write position in the data section */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* the reply to sbuf1[j] has the same layout/length, so rbuf2 reuses the
     second half of tmp (starting at tmp+msz) rather than a fresh allocation */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* For every request received (phase 1), compute how many nonzeros each
     requested row carries (diagonal A part + off-diagonal B part) and send the
     counts back (phase 2).  req_size[i] totals them for later send buffers. */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each size reply arrives, allocate exactly-sized buffers and post the
     receives for column indices (tag2 -> rbuf3) and values (tag3 -> rbuf4).
     Note requests are indexed by idex so Waitany order does not matter. */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* one contiguous allocation, partitioned by req_size[], holds the column
     indices for all requesters */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* write global column indices in ascending order:
             B entries left of the diagonal block, then A entries, then B
             entries right of it (bmap translates local B cols to global) */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* same partitioning as sbuf_aj, but for the numerical values (phase 4) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

          /* values are emitted in exactly the same order as the column
             indices above, so rbuf3/rbuf4 stay aligned entry-for-entry */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* stored 1-based (0 means "column not selected"); the CTABLE variant trades
     memory for lookup cost when C->cmap->N is large */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] = nonzero count of local row j of submatrix i; filled first
     from locally owned rows, then (below) from the off-process replies */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]){
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) { lens_i[j]++;}
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
  /* CTABLE stores 1-based entries; the array variant stores 0-based rows */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
    ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* walk each size reply (rbuf2) against the echo of our own request (sbuf1)
     to credit each remote row's selected columns to the right submatrix row */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns[is_no]){cmap_i  = cmap[is_no];}
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]){
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                lens_i[row]++;
              }
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* writes straight into the SeqAIJ internals (mat->i/j/a/ilen) instead of
     MatSetValues; ilen tracks how far each row has been filled */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]){
            for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol){
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k] ; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* same direct-fill as above, driven by the value replies (rbuf4) as they
     arrive; rbuf3 supplies the matching global column indices */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]){cmap_i = cmap[is_no];}
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          if (!allcolumns[is_no]){
            for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]){
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]){
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1530 
1531 /*
1532  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1533  Be careful not to destroy them elsewhere.
1534  */
1535 #undef __FUNCT__
1536 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1537 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1538 {
1539   /* If making this function public, change the error returned in this function away from _PLIB. */
1540   PetscErrorCode ierr;
1541   Mat_MPIAIJ     *aij;
1542   PetscBool      seqaij;
1543 
1544   PetscFunctionBegin;
1545   /* Check to make sure the component matrices are compatible with C. */
1546   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1547   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1548   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1549   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1550   if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1551 
1552   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1553   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1554   ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
1555   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1556   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1557   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1558   ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1559   aij = (Mat_MPIAIJ*)((*C)->data);
1560   aij->A = A;
1561   aij->B = B;
1562   ierr = PetscLogObjectParent(*C,A);CHKERRQ(ierr);
1563   ierr = PetscLogObjectParent(*C,B);CHKERRQ(ierr);
1564   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1565   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 #undef __FUNCT__
1570 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1571 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1572 {
1573   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1574   PetscFunctionBegin;
1575   PetscValidPointer(A,2);
1576   PetscValidPointer(B,3);
1577   *A = aij->A;
1578   *B = aij->B;
1579   /* Note that we don't incref *A and *B, so be careful! */
1580   PetscFunctionReturn(0);
1581 }
1582 
1583 #undef __FUNCT__
1584 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1585 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1586 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1587 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1588 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1589 {
1590   PetscErrorCode ierr;
1591   PetscMPIInt    size, flag;
1592   PetscInt       i,ii;
1593   PetscInt       ismax_c;
1594 
1595   PetscFunctionBegin;
1596   if (!ismax) {
1597     PetscFunctionReturn(0);
1598   }
1599   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1600     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
1601     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1602     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1603     if (size > 1) {
1604       ++ismax_c;
1605     }
1606   }
1607   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1608     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1609   } else { /* if (ismax_c) */
1610     Mat         *A,*B;
1611     IS          *isrow_c, *iscol_c;
1612     PetscMPIInt size;
1613     /*
1614      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1615      array of sequential matrices underlying the resulting parallel matrices.
1616      Which arrays to allocate is based on the value of MatReuse scall.
1617      There are as many diag matrices as there are original index sets.
1618      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1619 
1620      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1621      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1622      will deposite the extracted diag and off-diag parts.
1623      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1624      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1625     */
1626 
1627     /* Parallel matrix array is allocated only if no reuse is taking place. */
1628     if (scall != MAT_REUSE_MATRIX) {
1629       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1630     } else {
1631       ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr);
1632       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr);
1633       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1634       for (i = 0, ii = 0; i < ismax; ++i) {
1635         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1636         if (size > 1) {
1637           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
1638           ++ii;
1639         } else {
1640           A[i] = (*submat)[i];
1641         }
1642       }
1643     }
1644     /*
1645      Construct the complements of the iscol ISs for parallel ISs only.
1646      These are used to extract the off-diag portion of the resulting parallel matrix.
1647      The row IS for the off-diag portion is the same as for the diag portion,
1648      so we merely alias the row IS, while skipping those that are sequential.
1649     */
1650     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr);
1651     for (i = 0, ii = 0; i < ismax; ++i) {
1652       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1653       if (size > 1) {
1654 	isrow_c[ii] = isrow[i];
1655 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
1656 	++ii;
1657       }
1658     }
1659     /* Now obtain the sequential A and B submatrices separately. */
1660     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
1661     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
1662     for (ii = 0; ii < ismax_c; ++ii) {
1663       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
1664     }
1665     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
1666     /*
1667      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1668      have been extracted directly into the parallel matrices containing them, or
1669      simply into the sequential matrix identical with the corresponding A (if size == 1).
1670      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1671      A is put into the correct submat slot (if size == 1).
1672      */
1673     if (scall != MAT_REUSE_MATRIX) {
1674       for (i = 0, ii = 0; i < ismax; ++i) {
1675         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1676         if (size > 1) {
1677           /*
1678            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1679            */
1680           /* Construct submat[i] from the Seq pieces A and B. */
1681           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
1682 
1683           ++ii;
1684         } else {
1685           (*submat)[i] = A[i];
1686         }
1687       }
1688     }
1689     ierr = PetscFree(A);CHKERRQ(ierr);
1690     ierr = PetscFree(B);CHKERRQ(ierr);
1691   }
1692   PetscFunctionReturn(0);
1693 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1694 
1695 
1696 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
  AIJ front end for MatGetSubMatricesParallel_MPIXAIJ: supplies the AIJ-specific
  callbacks for sequential extraction (MatGetSubMatrices_MPIAIJ), parallel
  assembly from seq pieces (MatCreateMPIAIJFromSeqMatrices_Private), and seq
  piece recovery (MatMPIAIJExtractSeqMatrices_Private).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1707