xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision f73d5cc4fab9985badaa1a29b152f456bf59ad34)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
51   nrqr - no of requests received (which have to be or which have been processed)
52 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - performs one level of overlap growth.

   Round 1: every process sends the indices of its index sets to the owning
   processes (message format described above).  Each process then expands the
   rows it owns by one level of matrix connectivity (MatIncreaseOverlap_MPIAIJ_Local
   for locally generated requests, MatIncreaseOverlap_MPIAIJ_Receive for remote ones).
   Round 2: the expanded lists are sent back, merged into per-set bit tables to
   remove duplicates, and the input index sets are destroyed and recreated with
   the enlarged contents.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  /* separate tags for the request round and the reply round */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w1[p]: total no of indices destined for proc p */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w2[p]: 1 iff a message goes to proc p */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w3[p]: no of index sets that contribute to proc p */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* w4[p]: indices of is[i] owned by proc p */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one contiguous slice per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* ptr[j] walks the data area past the header */
  }

  /* Memory for doing local proc's work*/
  {
    /* table[i] is a bitmap over all M global rows, data[i] the growing index list */
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table; the bit test also removes duplicates */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input sets are destroyed; enlarged replacements are created below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand each remote request by one level; results land in xdata/isz1 */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* process replies in whatever order they arrive */
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data starts just past the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          /* bit table filters rows already present in the set */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the (now larger) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  All reply messages share one heap block anchored at xdata[0]; xdata[i] for
  i>0 are interior pointers into it.  Whenever the block is grown, every
  xdata[i] must be re-derived from its predecessor (the ctr loops below).

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather then all previous rows as it is now where a single large chunck of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* ct: total no of IS headers across all messages; total_sz: total no of requested rows */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* estimate reply size from the average no of nonzeros per local row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else      max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 tracks the total no of entries written into the shared xdata block */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1; /* read cursor: start of the data area in the request */
    ct2    =  ct1;        /* write cursor into xdata[i] */
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* the dedup table is reset per IS: sets must stay independent */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared block by 1.5x and re-derive all xdata[] pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each newly recorded row by its matrix connectivity */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global column numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
530 /* -------------------------------------------------------------------------*/
531 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
532 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    MatGetSubMatrix_MPIAIJ_All - every processor gets a complete sequential
    copy of the parallel matrix A, returned in *Bin.  Structure (column
    indices) is gathered only on MAT_INITIAL_MATRIX; numerical values are
    gathered only when flag == MAT_GET_VALUES.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* in-place: each rank already holds its own slice of lens[] */
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over.
       Columns are emitted in ascending global order by interleaving the
       off-diagonal block (below- and above-diagonal parts) around the
       diagonal block.
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* build directly into B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered on an earlier call */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* build directly into B's value array */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path: the counts were not allocated above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
719 
720 
721 
722 #undef __FUNCT__
723 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
724 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
725 {
726   PetscErrorCode ierr;
727   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
728   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
729 
730   PetscFunctionBegin;
731   /*
732        Check for special case: each processor gets entire matrix
733   */
734   if (ismax == 1 && C->rmap->N == C->cmap->N) {
735     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
736     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
737     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
738     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
739     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
740       wantallmatrix = PETSC_TRUE;
741       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
742     }
743   }
744   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
745   if (twantallmatrix) {
746     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
747     PetscFunctionReturn(0);
748   }
749 
750   /* Allocate memory to hold all the submatrices */
751   if (scall != MAT_REUSE_MATRIX) {
752     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
753   }
754 
755   /* Check for special case: each processor gets entire matrix columns */
756   ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
757   for (i=0; i<ismax; i++) {
758     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
759     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
760     if (colflag && ncol == C->cmap->N){
761       allcolumns[i] = PETSC_TRUE;
762     } else {
763       allcolumns[i] = PETSC_FALSE;
764     }
765   }
766 
767   /* Determine the number of stages through which submatrices are done */
768   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
769 
770   /*
771      Each stage will extract nmax submatrices.
772      nmax is determined by the matrix column dimension.
773      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
774   */
775   if (!nmax) nmax = 1;
776   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
777 
778   /* Make sure every processor loops through the nstages */
779   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
780 
781   for (i=0,pos=0; i<nstages; i++) {
782     if (pos+nmax <= ismax) max_no = nmax;
783     else if (pos == ismax) max_no = 0;
784     else                   max_no = ismax-pos;
785     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
786     pos += max_no;
787   }
788 
789   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
790   PetscFunctionReturn(0);
791 }
792 
793 /* -------------------------------------------------------------------------*/
794 #undef __FUNCT__
795 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - On each process, extracts the sequential
   submatrices submats[i] = C(isrow[i],iscol[i]) for i = 0,...,ismax-1 from the
   parallel AIJ matrix C.

   The exchange proceeds in four overlapped nonblocking phases, one MPI tag each:
     tag0: send the row indices each remote process must serve (sbuf1/rbuf1)
     tag1: reply with per-row nonzero counts so receivers can size buffers (sbuf2/rbuf2)
     tag2: send the column indices of the requested rows (sbuf_aj/rbuf3)
     tag3: send the numerical values of the requested rows (sbuf_aa/rbuf4)
   Local rows (owned by this process) are assembled directly without communication.

   allcolumns[i] == PETSC_TRUE means iscol[i] is the identity covering all of
   C's columns; in that case no column map is built and global column indices
   are stored directly in the submatrix.

   scall == MAT_REUSE_MATRIX requires submats[] to hold matrices of exactly the
   sizes and nonzero pattern lengths computed here (checked below); otherwise
   new SeqAIJ matrices are created.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the raw indices and sizes of each index set; for allcolumns ISs the
     column list is not needed (every column is taken) */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]){
      icol[i] = PETSC_NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w4[p]: rows of the current IS owned by proc p; accumulated into
     w1[p]: total rows requested from p, w3[p]: number of ISs touching p */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear scan of the ownership ranges to find the owner of this row */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is a single slab of 2*msz ints: the first msz hold all sbuf1 messages
     back-to-back, the second msz serve as rbuf2 receive space (see below) */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* Header layout (see file-top comment): slot 0 = number of ISs in the
     message, then (is_index, row_count) pairs, then the row indices */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 replies mirror the sbuf1 layout, so they reuse the second half of
     the tmp slab with the same per-destination lengths w1[] */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* Serve each incoming row-request: compute, per requested row, the total
     nonzeros from the diag (A) and off-diag (B) parts, and send the counts
     back on tag1; sbuf2_i[0] carries the grand total for the requester */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      /* idex identifies which posted receive completed; i only counts them */
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each size reply (rbuf2) arrives, its slot-0 total sizes the column
     (rbuf3, tag2) and value (rbuf4, tag3) receive buffers for that peer */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* One contiguous slab sliced per requester by req_size[] */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          /* emit global columns in ascending blocks: off-diag left of the
             diagonal block, then diag (shifted by cstart), then off-diag right */
          cols  = sbuf_aj_i + ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* Same slab/slice scheme and same left/diag/right ordering as the column
     sends above, so values line up with their column indices */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* Entries are stored shifted by +1 so that 0 can mean "column not in the
     submatrix" (both in the PetscTable and the dense-array variant) */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] = number of nonzeros in row j of submatrix i (for preallocation) */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]){
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) { lens_i[j]++;}
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* Walk each column-index reply in completion order; sbuf1 (kept alive for
     this purpose) tells which IS and rows the counts in rbuf2/rbuf3 refer to */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns[is_no]){cmap_i  = cmap[is_no];}
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]){
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                lens_i[row]++;
              }
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]){
            for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol){
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k] ; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]){cmap_i = cmap[is_no];}
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          if (!allcolumns[is_no]){
            for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]){
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]){
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1506 
1507 /*
1508  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1509  Be careful not to destroy them elsewhere.
1510  */
1511 #undef __FUNCT__
1512 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1513 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1514 {
1515   /* If making this function public, change the error returned in this function away from _PLIB. */
1516   PetscErrorCode ierr;
1517   Mat_MPIAIJ *aij;
1518   PetscBool seqaij;
1519 
1520   PetscFunctionBegin;
1521   /* Check to make sure the component matrices are compatible with C. */
1522   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1523   if(!seqaij) {
1524     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1525   }
1526   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1527   if(!seqaij) {
1528     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1529   }
1530   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1531     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1532   }
1533   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1534   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1535   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1536   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1537   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1538   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1539   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1540     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1541   }
1542   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1543   aij = (Mat_MPIAIJ*)((*C)->data);
1544   aij->A = A;
1545   aij->B = B;
1546   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1547   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1548   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1549   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 #undef __FUNCT__
1554 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1555 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1556 {
1557   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1558   PetscFunctionBegin;
1559   PetscValidPointer(A,2);
1560   PetscValidPointer(B,3);
1561   *A = aij->A;
1562   *B = aij->B;
1563   /* Note that we don't incref *A and *B, so be careful! */
1564   PetscFunctionReturn(0);
1565 }
1566 
1567 
1568 #undef __FUNCT__
1569 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1570 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1571 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1572 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1573 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1574 {
1575   PetscErrorCode ierr;
1576   PetscMPIInt size, flag;
1577   PetscInt    i,ii;
1578   PetscInt    ismax_c;
1579   PetscFunctionBegin;
1580   if(!ismax) {
1581     PetscFunctionReturn(0);
1582   }
1583   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1584     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1585     if(flag != MPI_IDENT) {
1586       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1587     }
1588     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1589     if(size > 1) {
1590       ++ismax_c;
1591     }
1592   }
1593   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1594     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1595   }
1596   else { /* if(ismax_c) */
1597     Mat *A,*B;
1598     IS  *isrow_c, *iscol_c;
1599     PetscMPIInt size;
1600     /*
1601      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1602      array of sequential matrices underlying the resulting parallel matrices.
1603      Which arrays to allocate is based on the value of MatReuse scall.
1604      There are as many diag matrices as there are original index sets.
1605      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1606 
1607      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1608      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1609      will deposite the extracted diag and off-diag parts.
1610      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1611      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1612     */
1613 
1614     /* Parallel matrix array is allocated only if no reuse is taking place. */
1615     if (scall != MAT_REUSE_MATRIX) {
1616       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1617     }
1618     else {
1619       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1620       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1621       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1622       for(i = 0, ii = 0; i < ismax; ++i) {
1623         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1624         if(size > 1) {
1625           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1626           ++ii;
1627         }
1628         else {
1629           A[i] = (*submat)[i];
1630         }
1631       }
1632     }
1633     /*
1634      Construct the complements of the iscol ISs for parallel ISs only.
1635      These are used to extract the off-diag portion of the resulting parallel matrix.
1636      The row IS for the off-diag portion is the same as for the diag portion,
1637      so we merely alias the row IS, while skipping those that are sequential.
1638     */
1639     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1640     for(i = 0, ii = 0; i < ismax; ++i) {
1641       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1642       if(size > 1) {
1643 	isrow_c[ii] = isrow[i];
1644 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1645 	++ii;
1646       }
1647     }
1648     /* Now obtain the sequential A and B submatrices separately. */
1649     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1650     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1651     for(ii = 0; ii < ismax_c; ++ii) {
1652       ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
1653     }
1654     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1655     /*
1656      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1657      have been extracted directly into the parallel matrices containing them, or
1658      simply into the sequential matrix identical with the corresponding A (if size == 1).
1659      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1660      A is put into the correct submat slot (if size == 1).
1661      */
1662     if(scall != MAT_REUSE_MATRIX) {
1663       for(i = 0, ii = 0; i < ismax; ++i) {
1664         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1665         if(size > 1) {
1666           /*
1667            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1668            */
1669           /* Construct submat[i] from the Seq pieces A and B. */
1670           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1671 
1672           ++ii;
1673         }
1674         else {
1675           (*submat)[i] = A[i];
1676         }
1677       }
1678     }
1679     ierr = PetscFree(A); CHKERRQ(ierr);
1680     ierr = PetscFree(B); CHKERRQ(ierr);
1681   }
1682   PetscFunctionReturn(0);
1683 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1684 
1685 
1686 
1687 #undef __FUNCT__
1688 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1689 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1690 {
1691   PetscErrorCode ierr;
1692   PetscFunctionBegin;
1693   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1694                                            MatCreateMPIAIJFromSeqMatrices_Private,
1695                                            MatMPIAIJExtractSeqMatrices_Private);
1696   CHKERRQ(ierr);
1697   PetscFunctionReturn(0);
1698 }
1699