xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 732e2eb99afef1b984167973f126bbed28c96039)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
51   nrqr - no of requests received (which have to be or which have been processed)
52 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one level of overlap increase on
   the imax index sets is[].  Phases: (1) count per-process message sizes,
   (2) exchange the row requests (tag1), (3) do the local part of the work,
   (4) process received requests and send back the answers (tag2),
   (5) merge remote answers into the local tables and rebuild is[].
   The input index sets are destroyed and recreated with the enlarged contents.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  /* cache the indices and sizes of all input index sets */
  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      /* w1[j] = total data to proc j; w3[j] = number of ISs touching proc j */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single tmp buffer into one contiguous slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work*/
  {
    PetscInt  *d_p;
    char      *t_p;

    /* single allocation carved into: table[], data[], isz[], the data rows,
       and the bit-table storage; should replace with PetscMallocN() */
    ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      M*imax*sizeof(PetscInt)  + (M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
      M*imax*sizeof(PetscInt)  + (M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data  = (PetscInt **)(table + imax);
    isz   = (PetscInt  *)(data  + imax);
    d_p   = (PetscInt  *)(isz   + imax);
    t_p   = (char *)(d_p   + M*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt     n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input ISs are consumed here; they are recreated (enlarged) below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the requests received in rbuf; answers land in xdata/isz1 */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* process answers in whatever order they arrive */
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data starts just past the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the (now enlarged) index sets destroyed above */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
336 
337 #undef __FUNCT__
338 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
339 /*
340    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
341        the work on the local processor.
342 
343      Inputs:
344       C      - MAT_MPIAIJ;
345       imax - total no of index sets processed at a time;
346       table  - an array of char - size = m bits.
347 
348      Output:
349       isz    - array containing the count of the solution elements correspondign
350                to each index set;
351       data   - pointer to the solutions
352 */
353 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
354 {
355   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
356   Mat        A = c->A,B = c->B;
357   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
358   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
359   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
360   PetscBT    table_i;
361 
362   PetscFunctionBegin;
363   rstart = C->rmap->rstart;
364   cstart = C->cmap->rstart;
365   ai     = a->i;
366   aj     = a->j;
367   bi     = b->i;
368   bj     = b->j;
369   garray = c->garray;
370 
371 
372   for (i=0; i<imax; i++) {
373     data_i  = data[i];
374     table_i = table[i];
375     isz_i   = isz[i];
376     for (j=0,max=isz[i]; j<max; j++) {
377       row   = data_i[j] - rstart;
378       start = ai[row];
379       end   = ai[row+1];
380       for (k=start; k<end; k++) { /* Amat */
381         val = aj[k] + cstart;
382         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
383       }
384       start = bi[row];
385       end   = bi[row+1];
386       for (k=start; k<end; k++) { /* Bmat */
387         val = garray[bj[k]];
388         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
389       }
390     }
391     isz[i] = isz_i;
392   }
393   PetscFunctionReturn(0);
394 }
395 
396 #undef __FUNCT__
397 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
398 /*
399       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
400          and return the output
401 
402          Input:
403            C    - the matrix
404            nrqr - no of messages being processed.
405            rbuf - an array of pointers to the recieved requests
406 
407          Output:
408            xdata - array of messages to be sent back
409            isz1  - size of each message
410 
411   For better efficiency perhaps we should malloc separately each xdata[i],
412 then if a remalloc is required we need only copy the data for that one row
413 rather then all previous rows as it is now where a single large chunck of
414 memory is used.
415 
416 */
417 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
418 {
419   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
420   Mat            A = c->A,B = c->B;
421   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
422   PetscErrorCode ierr;
423   PetscMPIInt    rank;
424   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
425   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
426   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
427   PetscInt       *rbuf_i,kmax,rbuf_0;
428   PetscBT        xtable;
429 
430   PetscFunctionBegin;
431   rank   = c->rank;
432   m      = C->rmap->N;
433   rstart = C->rmap->rstart;
434   cstart = C->cmap->rstart;
435   ai     = a->i;
436   aj     = a->j;
437   bi     = b->i;
438   bj     = b->j;
439   garray = c->garray;
440 
441 
442   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
443     rbuf_i  =  rbuf[i];
444     rbuf_0  =  rbuf_i[0];
445     ct     += rbuf_0;
446     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
447   }
448 
449   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
450   else      max1 = 1;
451   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
452   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
453   ++no_malloc;
454   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
455   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
456 
457   ct3 = 0;
458   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
459     rbuf_i =  rbuf[i];
460     rbuf_0 =  rbuf_i[0];
461     ct1    =  2*rbuf_0+1;
462     ct2    =  ct1;
463     ct3    += ct1;
464     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
465       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
466       oct2 = ct2;
467       kmax = rbuf_i[2*j];
468       for (k=0; k<kmax; k++,ct1++) {
469         row = rbuf_i[ct1];
470         if (!PetscBTLookupSet(xtable,row)) {
471           if (!(ct3 < mem_estimate)) {
472             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
473             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
474             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
475             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
476             xdata[0]     = tmp;
477             mem_estimate = new_estimate; ++no_malloc;
478             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
479           }
480           xdata[i][ct2++] = row;
481           ct3++;
482         }
483       }
484       for (k=oct2,max2=ct2; k<max2; k++) {
485         row   = xdata[i][k] - rstart;
486         start = ai[row];
487         end   = ai[row+1];
488         for (l=start; l<end; l++) {
489           val = aj[l] + cstart;
490           if (!PetscBTLookupSet(xtable,val)) {
491             if (!(ct3 < mem_estimate)) {
492               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
493               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
494               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
495               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
496               xdata[0]     = tmp;
497               mem_estimate = new_estimate; ++no_malloc;
498               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
499             }
500             xdata[i][ct2++] = val;
501             ct3++;
502           }
503         }
504         start = bi[row];
505         end   = bi[row+1];
506         for (l=start; l<end; l++) {
507           val = garray[bj[l]];
508           if (!PetscBTLookupSet(xtable,val)) {
509             if (!(ct3 < mem_estimate)) {
510               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
511               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
512               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
513               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
514               xdata[0]     = tmp;
515               mem_estimate = new_estimate; ++no_malloc;
516               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
517             }
518             xdata[i][ct2++] = val;
519             ct3++;
520           }
521         }
522       }
523       /* Update the header*/
524       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
525       xdata[i][2*j-1] = rbuf_i[2*j-1];
526     }
527     xdata[i][0] = rbuf_0;
528     xdata[i+1]  = xdata[i] + ct2;
529     isz1[i]     = ct2; /* size of each message */
530   }
531   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
532   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
533   PetscFunctionReturn(0);
534 }
535 /* -------------------------------------------------------------------------*/
536 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
537 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
538 /*
539     Every processor gets the entire matrix
540 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Builds on every process a sequential copy B of
   the whole parallel matrix A, using two MPI_Allgatherv rounds: one for row
   lengths/column indices (structure, only on MAT_INITIAL_MATRIX) and one for
   the numerical values (only when flag == MAT_GET_VALUES).  *Bin is allocated
   here on MAT_INITIAL_MATRIX and reused on MAT_REUSE_MATRIX.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* nonzeros in row i = diagonal-block count + off-diagonal-block count */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into B's
       j array at the offset of my first row
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same interleaving as the column-index copy above: lower B, then A, then upper B */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: recvcounts/displs were not built above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the sequential copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
724 
725 
726 
727 #undef __FUNCT__
728 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
729 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
730 {
731   PetscErrorCode ierr;
732   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
733   PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
734 
735   PetscFunctionBegin;
736   /*
737        Check for special case each processor gets entire matrix
738   */
739   if (ismax == 1 && C->rmap->N == C->cmap->N) {
740     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
741     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
742     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
743     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
744     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
745       wantallmatrix = PETSC_TRUE;
746       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
747     }
748   }
749   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
750   if (twantallmatrix) {
751     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
752     PetscFunctionReturn(0);
753   }
754 
755   /* Allocate memory to hold all the submatrices */
756   if (scall != MAT_REUSE_MATRIX) {
757     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
758   }
759   /* Determine the number of stages through which submatrices are done */
760   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
761   /*
762      Each stage will extract nmax submatrices.
763      nmax is determined by the matrix column dimension.
764      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
765   */
766   if (!nmax) nmax = 1;
767   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
768 
769   /* Make sure every processor loops through the nstages */
770   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
771 
772   for (i=0,pos=0; i<nstages; i++) {
773     if (pos+nmax <= ismax) max_no = nmax;
774     else if (pos == ismax) max_no = 0;
775     else                   max_no = ismax-pos;
776     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
777     pos += max_no;
778   }
779   PetscFunctionReturn(0);
780 }
781 
782 /* -------------------------------------------------------------------------*/
783 #undef __FUNCT__
784 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - extracts ismax sequential submatrices
   submats[i] = C(isrow[i],iscol[i]) from the parallel AIJ matrix C.

   Collective on C: every process participates in a four-round message
   exchange with the processes that own rows it requests.

   Input Parameters:
.  C     - the parallel MPIAIJ matrix
.  ismax - number of index-set pairs processed in this pass
.  isrow - row index sets (global row numbers)
.  iscol - column index sets (global column numbers)
.  scall - MAT_INITIAL_MATRIX to create submats, MAT_REUSE_MATRIX to refill
           previously created ones (sizes and nonzero counts must match)

   Output Parameter:
.  submats - the resulting sequential matrices

   Communication plan (one tag per round):
     tag0: send lists of needed rows to their owning processes
     tag1: owners reply with the nonzero count of each requested row
     tag2: owners send the (global) column indices of the requested rows
     tag3: owners send the numerical values of the requested rows
   Rows owned locally are copied directly via MatGetRow_MPIAIJ.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscBool      sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* Check if the col indices are sorted */
  /* NOTE(review): the sortedness result is computed but the error checks are
     commented out, so unsorted index sets are silently accepted here */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
    /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    /*    if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
  }
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache the raw indices and sizes of every index-set pair */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p]: total rows requested from proc p; w2[p]: 1 iff a message goes to p;
     w3[p]: number of index sets that need rows from p; w4[p]: per-IS scratch */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      /* linear scan over the ownership ranges to find the owner of row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] holds the ranks actually messaged, in increasing order */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is sized 2*msz: the first msz entries back the send buffers sbuf1,
     the second msz entries back the receive buffers rbuf2 (aliased below) */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Carve tmp into per-destination send buffers of length w1[j] */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* Header layout (see file-top comment): sbuf1[j][0] = #index sets,
     then (is_no,count) pairs, then the row indices; ptr[j] tracks the
     current write position in the data section */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 aliases the second half of tmp; each reply mirrors the request
     layout, with per-row nonzero counts in the data section */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* As each row request arrives, compute the nonzero count of every
     requested row (diag + off-diag parts) and send the counts back */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;  /* data section begins after the header */
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* reply[0] holds the total data size; allocate and post receives for the
     column-index (rbuf3) and value (rbuf4) messages of rounds 3 and 4 */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* One contiguous slab carved into per-requester segments of req_size[i] */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

	  lwrite = 0;
          /* emit global column indices in increasing order: off-diag
             entries left of the diag block, then diag, then the rest */
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* Same slab layout and traversal as the column-index round, but shipping
     the numerical values (tag3) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          /* value order must match the column-index order produced above */
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  /* cmap[i] maps global column -> (local column + 1); 0 means "not in iscol[i]".
     The +1 shift keeps 0 usable as the not-found sentinel */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr);
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
    /* dense variant: one length-N array per index set */
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] will hold the nonzero count of row j of submats[i] and is used
     for preallocation (or validated against it when reusing) */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
          tcol = cmap_i[cols[k]];
#endif
          if (tcol) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map*/
  /* rmap[i] maps global row -> local row of submats[i]; note the CTABLE
     variant stores j+1 (decremented on lookup) while the dense variant
     stores j directly */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* Walk each reply in tandem with the original request (sbuf1) so the
     (is_no,row) attribution of every received count is recovered */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        cmap_i  = cmap[is_no];
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    /* fresh submatrices, preallocated exactly from lens */
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          /* copy only the entries whose global column is in iscol[i] */
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol){
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* Same request-mirroring walk as the lens update, now scattering the
     received column indices (rbuf3) and values (rbuf4) into the submatrices */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    ierr = PetscTableDestroy((PetscTable)cmap[i]);CHKERRQ(ierr);
    ierr = PetscTableDestroy((PetscTable)rmap[i]);CHKERRQ(ierr);
  }
#else
  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1457 
1458 /*
1459  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1460  Be careful not to destroy them elsewhere.
1461  */
1462 #undef __FUNCT__
1463 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1464 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1465 {
1466   /* If making this function public, change the error returned in this function away from _PLIB. */
1467   PetscErrorCode ierr;
1468   Mat_MPIAIJ *aij;
1469   PetscBool seqaij;
1470 
1471   PetscFunctionBegin;
1472   /* Check to make sure the component matrices are compatible with C. */
1473   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1474   if(!seqaij) {
1475     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1476   }
1477   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1478   if(!seqaij) {
1479     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1480   }
1481   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1482     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1483   }
1484   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1485   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1486   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1487   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1488   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1489   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1490   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1491     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1492   }
1493   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1494   aij = (Mat_MPIAIJ*)((*C)->data);
1495   aij->A = A;
1496   aij->B = B;
1497   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1498   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1499   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1500   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1501   PetscFunctionReturn(0);
1502 }
1503 
1504 #undef __FUNCT__
1505 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1506 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1507 {
1508   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1509   PetscFunctionBegin;
1510   PetscValidPointer(A,2);
1511   PetscValidPointer(B,3);
1512   *A = aij->A;
1513   *B = aij->B;
1514   /* Note that we don't incref *A and *B, so be careful! */
1515   PetscFunctionReturn(0);
1516 }
1517 
1518 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
  MatGetSubMatricesParallel_MPIXAIJ - matrix-type-independent engine for extracting
  (possibly parallel) submatrices C[isrow[i],iscol[i]].  Each submatrix lives on the
  communicator shared by its index-set pair: if that communicator has size 1 the
  submatrix is obtained directly from getsubmats_seq(); if it is parallel, a
  "diagonal" sequential piece A (columns from iscol[i]) and an "off-diagonal"
  sequential piece B (columns from the nonlocal complement of iscol[i], obtained
  via ISGetNonlocalIS) are extracted separately and glued into a parallel matrix
  by makefromseq().

  Input parameters:
+ C              - the matrix to extract submatrices from
. ismax          - number of index-set pairs
. isrow, iscol   - row/column index sets; isrow[i] and iscol[i] must have an
                   identical (MPI_IDENT) communicator
. scall          - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
. getsubmats_seq - callback that extracts sequential submatrices; when
                   scall != MAT_REUSE_MATRIX it also allocates the output array
. makefromseq    - callback that builds a parallel matrix from a seq diag part
                   and a seq off-diag part
- extractseq     - callback that recovers the seq diag/off-diag parts from an
                   existing parallel matrix (used only on reuse)

  Output parameter:
. submat - array of ismax submatrices; allocated here when scall != MAT_REUSE_MATRIX

  NOTE(review): the sequential pieces created in the MAT_INITIAL_MATRIX path are
  handed to makefromseq() and only the holder arrays A and B are freed here, so
  ownership of the seq matrices presumably passes to the parallel matrices --
  TODO confirm against the callback implementations.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt size, flag;
  PetscInt    i,ii;
  PetscInt    ismax_c;  /* number of parallel (comm size > 1) index-set pairs */
  PetscFunctionBegin;
  /* Nothing to do for an empty list of index sets. */
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate that each row/column IS pair shares one communicator, and count
     the parallel pairs: only those need the diag/off-diag treatment below. */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else { /* if(ismax_c) */
    Mat *A,*B;
    IS  *isrow_c, *iscol_c;
    PetscMPIInt size;  /* NOTE(review): shadows the 'size' declared at function scope */
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

     Sequential matrix arrays are needed in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
     getsubmats_seq will deposit the extracted diag and off-diag parts.
     However, if reuse is taking place, we have to allocate the seq matrix arrays here.
     If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      /* A and B are left unset here: getsubmats_seq allocates them below when
         called with MAT_INITIAL_MATRIX semantics. */
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /* Pull the existing seq diag/off-diag parts out of the parallel submatrix. */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        }
        else {
          /* A sequential submatrix IS its own diag part; no off-diag slot is consumed. */
          A[i] = (*submat)[i];
        }
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
	isrow_c[ii] = isrow[i];
	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
	++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    /* Drop the nonlocal ISs; isrow_c entries are mere aliases and are not destroyed. */
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
     If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
     have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or the
     A is put into the correct submat slot (if size == 1).
     */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
           For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
           */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Free only the holder arrays; the seq matrices themselves now live inside
       (or are aliased by) the entries of *submat. */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */
1635 
1636 
1637 
1638 #undef __FUNCT__
1639 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1640 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1641 {
1642   PetscErrorCode ierr;
1643   PetscFunctionBegin;
1644   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1645                                            MatCreateMPIAIJFromSeqMatrices_Private,
1646                                            MatMPIAIJExtractSeqMatrices_Private);
1647   CHKERRQ(ierr);
1648   PetscFunctionReturn(0);
1649 }
1650