xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 84df9cb40eca90ea9b18a456fab7a4ecc7f6c1a4)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or have been, processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Increases the overlap of the imax index
   sets in is[] by exactly one level.  Indices owned locally are expanded by
   MatIncreaseOverlap_MPIAIJ_Local(); indices owned by other processes are
   shipped to their owners (message format described above), expanded there
   by MatIncreaseOverlap_MPIAIJ_Receive(), and the results are gathered back.
   The input index sets are destroyed and replaced with the enlarged ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;

  /* two tags: tag1 for the request messages, tag2 for the replies */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total indices destined for proc p, w3[p] = number of index sets
     contributing to proc p, w2[p] = 1 iff a message goes to p at all */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is a single buffer of msz ints; outdat[j] points at proc j's slice */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* data region starts past the header */
  }

  /* Memory for doing local proc's work*/
  /* table[i]: one bit per global row (duplicate suppression); data[i]: growing
     list of global rows in set i; isz[i]: current length of data[i] */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];          /* one more IS in this message */
          outdat_j[2*k]   = ctr_j;                  /* count of indices for this IS */
          outdat_j[2*k-1] = i;                      /* which IS they belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input sets are replaced by the enlarged ones created below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand each received request; xdata[i]/isz1[i] hold the reply to sender i */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* process replies in whatever order they arrive */
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;   /* data region starts past the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged output index sets from the accumulated lists */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
395 /*
396       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
397          and return the output
398 
399          Input:
400            C    - the matrix
401            nrqr - no of messages being processed.
402            rbuf - an array of pointers to the recieved requests
403 
404          Output:
405            xdata - array of messages to be sent back
406            isz1  - size of each message
407 
408   For better efficiency perhaps we should malloc separately each xdata[i],
409 then if a remalloc is required we need only copy the data for that one row
410 rather then all previous rows as it is now where a single large chunck of
411 memory is used.
412 
413 */
414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
415 {
416   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
417   Mat            A = c->A,B = c->B;
418   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
419   PetscErrorCode ierr;
420   PetscMPIInt    rank;
421   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
422   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
423   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
424   PetscInt       *rbuf_i,kmax,rbuf_0;
425   PetscBT        xtable;
426 
427   PetscFunctionBegin;
428   rank   = c->rank;
429   m      = C->rmap->N;
430   rstart = C->rmap->rstart;
431   cstart = C->cmap->rstart;
432   ai     = a->i;
433   aj     = a->j;
434   bi     = b->i;
435   bj     = b->j;
436   garray = c->garray;
437 
438 
439   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
440     rbuf_i  =  rbuf[i];
441     rbuf_0  =  rbuf_i[0];
442     ct     += rbuf_0;
443     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
444   }
445 
446   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
447   else      max1 = 1;
448   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
449   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
450   ++no_malloc;
451   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
452   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
453 
454   ct3 = 0;
455   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
456     rbuf_i =  rbuf[i];
457     rbuf_0 =  rbuf_i[0];
458     ct1    =  2*rbuf_0+1;
459     ct2    =  ct1;
460     ct3    += ct1;
461     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
462       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
463       oct2 = ct2;
464       kmax = rbuf_i[2*j];
465       for (k=0; k<kmax; k++,ct1++) {
466         row = rbuf_i[ct1];
467         if (!PetscBTLookupSet(xtable,row)) {
468           if (!(ct3 < mem_estimate)) {
469             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
470             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
471             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
472             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
473             xdata[0]     = tmp;
474             mem_estimate = new_estimate; ++no_malloc;
475             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
476           }
477           xdata[i][ct2++] = row;
478           ct3++;
479         }
480       }
481       for (k=oct2,max2=ct2; k<max2; k++) {
482         row   = xdata[i][k] - rstart;
483         start = ai[row];
484         end   = ai[row+1];
485         for (l=start; l<end; l++) {
486           val = aj[l] + cstart;
487           if (!PetscBTLookupSet(xtable,val)) {
488             if (!(ct3 < mem_estimate)) {
489               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
490               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
491               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
492               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
493               xdata[0]     = tmp;
494               mem_estimate = new_estimate; ++no_malloc;
495               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
496             }
497             xdata[i][ct2++] = val;
498             ct3++;
499           }
500         }
501         start = bi[row];
502         end   = bi[row+1];
503         for (l=start; l<end; l++) {
504           val = garray[bj[l]];
505           if (!PetscBTLookupSet(xtable,val)) {
506             if (!(ct3 < mem_estimate)) {
507               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
508               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
509               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
510               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
511               xdata[0]     = tmp;
512               mem_estimate = new_estimate; ++no_malloc;
513               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
514             }
515             xdata[i][ct2++] = val;
516             ct3++;
517           }
518         }
519       }
520       /* Update the header*/
521       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
522       xdata[i][2*j-1] = rbuf_i[2*j-1];
523     }
524     xdata[i][0] = rbuf_0;
525     xdata[i+1]  = xdata[i] + ct2;
526     isz1[i]     = ct2; /* size of each message */
527   }
528   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
529   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
530   PetscFunctionReturn(0);
531 }
532 /* -------------------------------------------------------------------------*/
533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
535 /*
536     Every processor gets the entire matrix
537 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gives every process its own sequential copy of
   the entire parallel matrix A.

   With scall == MAT_INITIAL_MATRIX the sequential matrix B is created
   (structure gathered via MPI_Allgatherv of row lengths and column indices);
   otherwise the previously created matrix **Bin is reused.  With
   flag == MAT_GET_VALUES the numerical values are also gathered; otherwise
   only the nonzero structure is produced.

   Output: Bin - address of a malloc'ed Mat holding the gathered matrix
   (caller owns both the Mat and the malloc'ed pointer).
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* local row length = diagonal-block nnz + off-diagonal-block nnz */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* write my rows directly into B's j array, at this rank's row offset;
       the in-place Allgatherv below then fills in everyone else's rows */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    /* values are packed in the same lower/diagonal/upper order as the
       column indices above, so they land in matching positions */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs exist only if the structure was built this call */
    if (!recvcounts) {
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information so solvers can exploit it on the copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
721 
722 
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
727 {
728   PetscErrorCode ierr;
729   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
730   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
731 
732   PetscFunctionBegin;
733   /*
734        Check for special case: each processor gets entire matrix
735   */
736   if (ismax == 1 && C->rmap->N == C->cmap->N) {
737     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
738     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
739     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
740     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
741     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
742       wantallmatrix = PETSC_TRUE;
743       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
744     }
745   }
746   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
747   if (twantallmatrix) {
748     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
749     PetscFunctionReturn(0);
750   }
751 
752   /* Allocate memory to hold all the submatrices */
753   if (scall != MAT_REUSE_MATRIX) {
754     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
755   }
756 
757   /* Check for special case: each processor gets entire matrix columns */
758   ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
759   for (i=0; i<ismax; i++) {
760     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
761     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
762     if (colflag && ncol == C->cmap->N){
763       allcolumns[i] = PETSC_TRUE;
764     } else {
765       allcolumns[i] = PETSC_FALSE;
766     }
767   }
768 
769   /* Determine the number of stages through which submatrices are done */
770   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
771 
772   /*
773      Each stage will extract nmax submatrices.
774      nmax is determined by the matrix column dimension.
775      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
776   */
777   if (!nmax) nmax = 1;
778   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
779 
780   /* Make sure every processor loops through the nstages */
781   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
782 
783   for (i=0,pos=0; i<nstages; i++) {
784     if (pos+nmax <= ismax) max_no = nmax;
785     else if (pos == ismax) max_no = 0;
786     else                   max_no = ismax-pos;
787     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
788     pos += max_no;
789   }
790 
791   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
792   PetscFunctionReturn(0);
793 }
794 
795 /* -------------------------------------------------------------------------*/
796 #undef __FUNCT__
797 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts ismax sequential submatrices from the
   parallel AIJ matrix C, one per (isrow[i],iscol[i]) pair, into submats[].

   Rows owned by other processes are fetched with a four-round nonblocking
   protocol, one MPI tag per round:
     tag0: send each owning process the list of rows needed (grouped per IS);
     tag1: the owner replies with the nonzero count of every requested row;
     tag2: the owner sends the global column indices of those rows;
     tag3: the owner sends the matching numerical values.
   Locally owned rows are copied directly via MatGetRow_MPIAIJ.

   allcolumns[i] == PETSC_TRUE means iscol[i] is the identity over all of C's
   columns, so no column map is built and global column indices are stored
   into submats[i] directly.

   scall == MAT_REUSE_MATRIX requires submats[] to already exist with exactly
   the same row lengths as computed here; otherwise new SeqAIJ matrices are
   created and preallocated from the computed lens[][].
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Cache the indices and sizes of every row/column IS; for an allcolumns IS
     no index array is needed, only the global column count */
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]){
      icol[i] = PETSC_NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total rows requested from proc p, w2[p] = 1 iff any message goes to p,
     w3[p] = number of ISs that need rows from p, w4 = per-IS scratch counts */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* find the owning process of global row by linear scan of ownership ranges */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] lists the destination ranks of the nrqs outgoing messages */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is sized 2*msz: the first msz entries back the sbuf1[] request messages,
     the second msz entries are later reused as the rbuf2[] reply buffers */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve per-destination slices of tmp for the outgoing messages */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* message layout: sbuf1[j][0] = #ISs, then (is_no,count) pairs, then the row indices */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* the replies echo the request header and replace each row index by that
     row's nonzero count, so w1[j] entries suffice; storage is the second half of tmp */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      /* idex is the index of whichever incoming request just completed; the
         reply is built in sbuf2[idex]: header copied from the request, row
         indices replaced by each row's nonzero count (diag + off-diag) */
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* rbuf2[idex][0] holds the total nonzero count we will receive from that
     sender, sizing the tag2 (indices) and tag3 (values) receive buffers */

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols, in ascending
             global order: off-diag part left of the diag block, then the
             diag block, then off-diag part right of the diag block */
          cols  = sbuf_aj_i + ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in the same
             order as the indices packed into sbuf_aj above */
          vals  = sbuf_aa_i+ct2;

	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* cmap[i] maps a global column to (local column + 1) of submatrix i;
     0 means the column is not in iscol[i].  Skipped for allcolumns ISs. */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] = number of kept nonzeros in local row j of submatrix i */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]){
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) { lens_i[j]++;}
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
  /* (the table variant stores local row + 1; the array variant stores local row) */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* walk each reply (rbuf2 = per-row counts, rbuf3 = column indices) in the
     order its own request message was laid out, counting kept columns */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns[is_no]){cmap_i  = cmap[is_no];}
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]){
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                lens_i[row]++;
              }
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]){
            for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol){
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k] ; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      /* as each tag3 (values) receive completes, its tag2 (indices) receive
         has already completed, so rbuf3/rbuf4 for that sender are both valid */
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]){cmap_i = cmap[is_no];}
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          if (!allcolumns[is_no]){
            for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]){
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]){
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1508 
1509 /*
1510  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1511  Be careful not to destroy them elsewhere.
1512  */
1513 #undef __FUNCT__
1514 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1515 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1516 {
1517   /* If making this function public, change the error returned in this function away from _PLIB. */
1518   PetscErrorCode ierr;
1519   Mat_MPIAIJ *aij;
1520   PetscBool seqaij;
1521 
1522   PetscFunctionBegin;
1523   /* Check to make sure the component matrices are compatible with C. */
1524   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1525   if(!seqaij) {
1526     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1527   }
1528   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1529   if(!seqaij) {
1530     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1531   }
1532   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1533     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1534   }
1535   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1536   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1537   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1538   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1539   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1540   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1541   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1542     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1543   }
1544   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1545   aij = (Mat_MPIAIJ*)((*C)->data);
1546   aij->A = A;
1547   aij->B = B;
1548   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1549   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1550   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1551   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1552   PetscFunctionReturn(0);
1553 }
1554 
1555 #undef __FUNCT__
1556 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1557 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1558 {
1559   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1560   PetscFunctionBegin;
1561   PetscValidPointer(A,2);
1562   PetscValidPointer(B,3);
1563   *A = aij->A;
1564   *B = aij->B;
1565   /* Note that we don't incref *A and *B, so be careful! */
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 
1570 #undef __FUNCT__
1571 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1572 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1573 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1574 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1575 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1576 {
1577   PetscErrorCode ierr;
1578   PetscMPIInt size, flag;
1579   PetscInt    i,ii;
1580   PetscInt    ismax_c;
1581   PetscFunctionBegin;
1582   if(!ismax) {
1583     PetscFunctionReturn(0);
1584   }
1585   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1586     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1587     if(flag != MPI_IDENT) {
1588       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1589     }
1590     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1591     if(size > 1) {
1592       ++ismax_c;
1593     }
1594   }
1595   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1596     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1597   }
1598   else { /* if(ismax_c) */
1599     Mat *A,*B;
1600     IS  *isrow_c, *iscol_c;
1601     PetscMPIInt size;
1602     /*
1603      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1604      array of sequential matrices underlying the resulting parallel matrices.
1605      Which arrays to allocate is based on the value of MatReuse scall.
1606      There are as many diag matrices as there are original index sets.
1607      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1608 
1609      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1610      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1611      will deposite the extracted diag and off-diag parts.
1612      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1613      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1614     */
1615 
1616     /* Parallel matrix array is allocated only if no reuse is taking place. */
1617     if (scall != MAT_REUSE_MATRIX) {
1618       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1619     }
1620     else {
1621       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1622       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1623       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1624       for(i = 0, ii = 0; i < ismax; ++i) {
1625         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1626         if(size > 1) {
1627           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1628           ++ii;
1629         }
1630         else {
1631           A[i] = (*submat)[i];
1632         }
1633       }
1634     }
1635     /*
1636      Construct the complements of the iscol ISs for parallel ISs only.
1637      These are used to extract the off-diag portion of the resulting parallel matrix.
1638      The row IS for the off-diag portion is the same as for the diag portion,
1639      so we merely alias the row IS, while skipping those that are sequential.
1640     */
1641     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1642     for(i = 0, ii = 0; i < ismax; ++i) {
1643       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1644       if(size > 1) {
1645 	isrow_c[ii] = isrow[i];
1646 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1647 	++ii;
1648       }
1649     }
1650     /* Now obtain the sequential A and B submatrices separately. */
1651     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1652     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1653     for(ii = 0; ii < ismax_c; ++ii) {
1654       ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
1655     }
1656     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1657     /*
1658      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1659      have been extracted directly into the parallel matrices containing them, or
1660      simply into the sequential matrix identical with the corresponding A (if size == 1).
1661      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1662      A is put into the correct submat slot (if size == 1).
1663      */
1664     if(scall != MAT_REUSE_MATRIX) {
1665       for(i = 0, ii = 0; i < ismax; ++i) {
1666         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1667         if(size > 1) {
1668           /*
1669            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1670            */
1671           /* Construct submat[i] from the Seq pieces A and B. */
1672           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1673 
1674           ++ii;
1675         }
1676         else {
1677           (*submat)[i] = A[i];
1678         }
1679       }
1680     }
1681     ierr = PetscFree(A); CHKERRQ(ierr);
1682     ierr = PetscFree(B); CHKERRQ(ierr);
1683   }
1684   PetscFunctionReturn(0);
1685 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1686 
1687 
1688 
1689 #undef __FUNCT__
1690 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1691 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1692 {
1693   PetscErrorCode ierr;
1694   PetscFunctionBegin;
1695   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1696                                            MatCreateMPIAIJFromSeqMatrices_Private,
1697                                            MatMPIAIJExtractSeqMatrices_Private);
1698   CHKERRQ(ierr);
1699   PetscFunctionReturn(0);
1700 }
1701