xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision a23e4ea714e49b3ffb9a7ce16012d5cc88dd9166)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
  mesg [5] ... mesg [n]    data(is[1])
  -----------
  mesg [n+1] ... mesg [m]  data(is[5])
  -----------
48 
49   Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or have been, processed)
52 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs ONE round of overlap increase for
   the index sets is[0..imax-1] of the parallel matrix C.  Indices whose rows
   are owned locally are expanded in place; indices owned by other processes
   are shipped to their owners (tag1 messages, format described above), the
   owners expand them, and the answers are gathered back (tag2 messages).
   On exit every is[i] has been destroyed and recreated with the enlarged set.

   Collective on the communicator of C.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;   /* global number of rows: sizes the per-IS bit tables */

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  /* idx[i]/n[i]: indices and length of each incoming index set */
  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = number of indices destined for proc p, w2[p] = 1 iff any message
     goes to p, w3[p] = number of index sets contributing to p, w4 = per-IS
     scratch counts. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Carve the single buffer tmp into one outgoing message per destination;
       outdat[j] points at the slice for proc j */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: [0] = IS count, then (is-number,length) pairs;
     ptr[j] is the running write cursor past the header of message j */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table (t_p slices) and one
     index array of capacity M (d_p slices) per index set */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data:
     locally owned rows go into table/data, remote rows into the message for
     their owner */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input sets are consumed here; they are recreated (enlarged) at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work: expand locally owned rows while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the received requests; xdata[i]/isz1[i] become the reply to
     request i (xdata slices one buffer allocated inside _Receive) */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);  /* rbuf[0] holds the contiguous recv buffer */
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors: merge each reply into the
     corresponding local table/data, skipping indices already present */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the requests */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (now enlarged) index sets from the accumulated data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
395 /*
396       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
397          and return the output
398 
399          Input:
400            C    - the matrix
401            nrqr - no of messages being processed.
402            rbuf - an array of pointers to the recieved requests
403 
404          Output:
405            xdata - array of messages to be sent back
406            isz1  - size of each message
407 
408   For better efficiency perhaps we should malloc separately each xdata[i],
409 then if a remalloc is required we need only copy the data for that one row
410 rather then all previous rows as it is now where a single large chunck of
411 memory is used.
412 
413 */
414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
415 {
416   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
417   Mat            A = c->A,B = c->B;
418   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
419   PetscErrorCode ierr;
420   PetscMPIInt    rank;
421   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
422   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
423   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
424   PetscInt       *rbuf_i,kmax,rbuf_0;
425   PetscBT        xtable;
426 
427   PetscFunctionBegin;
428   rank   = c->rank;
429   m      = C->rmap->N;
430   rstart = C->rmap->rstart;
431   cstart = C->cmap->rstart;
432   ai     = a->i;
433   aj     = a->j;
434   bi     = b->i;
435   bj     = b->j;
436   garray = c->garray;
437 
438 
439   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
440     rbuf_i  =  rbuf[i];
441     rbuf_0  =  rbuf_i[0];
442     ct     += rbuf_0;
443     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
444   }
445 
446   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
447   else      max1 = 1;
448   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
449   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
450   ++no_malloc;
451   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
452   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
453 
454   ct3 = 0;
455   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
456     rbuf_i =  rbuf[i];
457     rbuf_0 =  rbuf_i[0];
458     ct1    =  2*rbuf_0+1;
459     ct2    =  ct1;
460     ct3    += ct1;
461     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
462       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
463       oct2 = ct2;
464       kmax = rbuf_i[2*j];
465       for (k=0; k<kmax; k++,ct1++) {
466         row = rbuf_i[ct1];
467         if (!PetscBTLookupSet(xtable,row)) {
468           if (!(ct3 < mem_estimate)) {
469             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
470             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
471             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
472             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
473             xdata[0]     = tmp;
474             mem_estimate = new_estimate; ++no_malloc;
475             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
476           }
477           xdata[i][ct2++] = row;
478           ct3++;
479         }
480       }
481       for (k=oct2,max2=ct2; k<max2; k++) {
482         row   = xdata[i][k] - rstart;
483         start = ai[row];
484         end   = ai[row+1];
485         for (l=start; l<end; l++) {
486           val = aj[l] + cstart;
487           if (!PetscBTLookupSet(xtable,val)) {
488             if (!(ct3 < mem_estimate)) {
489               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
490               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
491               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
492               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
493               xdata[0]     = tmp;
494               mem_estimate = new_estimate; ++no_malloc;
495               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
496             }
497             xdata[i][ct2++] = val;
498             ct3++;
499           }
500         }
501         start = bi[row];
502         end   = bi[row+1];
503         for (l=start; l<end; l++) {
504           val = garray[bj[l]];
505           if (!PetscBTLookupSet(xtable,val)) {
506             if (!(ct3 < mem_estimate)) {
507               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
508               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
509               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
510               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
511               xdata[0]     = tmp;
512               mem_estimate = new_estimate; ++no_malloc;
513               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
514             }
515             xdata[i][ct2++] = val;
516             ct3++;
517           }
518         }
519       }
520       /* Update the header*/
521       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
522       xdata[i][2*j-1] = rbuf_i[2*j-1];
523     }
524     xdata[i][0] = rbuf_0;
525     xdata[i+1]  = xdata[i] + ct2;
526     isz1[i]     = ct2; /* size of each message */
527   }
528   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
529   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
530   PetscFunctionReturn(0);
531 }
532 /* -------------------------------------------------------------------------*/
533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool,Mat*);
534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets a full sequential copy of the entire parallel matrix.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gathers the whole MPIAIJ matrix A onto every
   process as one sequential matrix *Bin[0].

   flag  - MAT_GET_VALUES gathers numerical values too; otherwise only the
           nonzero structure is built/reused
   scall - MAT_INITIAL_MATRIX builds the structure (row lengths + column
           indices, two MPI_Allgatherv rounds); MAT_REUSE_MATRIX reuses the
           previously returned matrix and refreshes values only

   Collective on the communicator of A.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* local row length = diagonal-block nnz + off-diagonal-block nnz */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* in-place variant: each rank's slice of lens is already positioned */
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, directly into B's j array
       at the slot corresponding to this rank's rows
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* columns must come out globally sorted: below-diagonal off-diag part,
       then the diagonal block, then above-diagonal off-diag part */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices: prefix sum of the row lengths */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered on a previous call */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
       (same lower/diagonal/upper interleaving as the column indices above)
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE path: counts/displs were not built above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information so the sequential copy can exploit it */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
721 
722 
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
727 {
728   PetscErrorCode ierr;
729   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
730   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,wantallmatrixcolumns=PETSC_FALSE;
731 
732   PetscFunctionBegin;
733   /*
734        Check for special case: each processor gets entire matrix
735   */
736   if (ismax == 1 && C->rmap->N == C->cmap->N) {
737     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
738     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
739     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
740     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
741     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
742       wantallmatrix = PETSC_TRUE;
743       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
744     }
745   }
746   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
747   if (twantallmatrix) {
748     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
749     PetscFunctionReturn(0);
750   }
751 
752   /* Allocate memory to hold all the submatrices */
753   if (scall != MAT_REUSE_MATRIX) {
754     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
755   }
756 
757   /* Check for special case: each processor gets entire matrix columns */
758   if (ismax == 1){
759     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
760     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
761     if (colflag && ncol == C->cmap->N){
762       wantallmatrixcolumns = PETSC_TRUE;
763       ierr = MatGetSubMatrices_MPIAIJ_Local(C,ismax,isrow,iscol,scall,wantallmatrixcolumns,*submat);CHKERRQ(ierr);
764       PetscFunctionReturn(0);
765     }
766   }
767 
768   /* Determine the number of stages through which submatrices are done */
769   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
770   /*
771      Each stage will extract nmax submatrices.
772      nmax is determined by the matrix column dimension.
773      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
774   */
775   if (!nmax) nmax = 1;
776   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
777 
778   /* Make sure every processor loops through the nstages */
779   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
780 
781   for (i=0,pos=0; i<nstages; i++) {
782     if (pos+nmax <= ismax) max_no = nmax;
783     else if (pos == ismax) max_no = 0;
784     else                   max_no = ismax-pos;
785     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,wantallmatrixcolumns,*submat+pos);CHKERRQ(ierr);
786     pos += max_no;
787   }
788   PetscFunctionReturn(0);
789 }
790 
791 /* -------------------------------------------------------------------------*/
792 #undef __FUNCT__
793 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts the sequential submatrices
   submats[i] = C(isrow[i],iscol[i]), i = 0,...,ismax-1, from the parallel
   AIJ matrix C.  Rows owned by other processes are fetched through a
   four-phase nonblocking exchange:

     tag0: requested global row indices          (sbuf1  -> rbuf1)
     tag1: nonzero counts for the requested rows (sbuf2  -> rbuf2)
     tag2: column indices of the requested rows  (sbuf_aj -> rbuf3)
     tag3: numerical values of the requested rows(sbuf_aa -> rbuf4)

   If allcolumns is PETSC_TRUE every column is retained and global column
   indices are stored directly in the submatrices; otherwise cmap maps each
   global column to its (1-based) position in iscol[i] and only those
   columns are kept.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Cache the indices and local sizes of each row/column IS */
  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++; /* linear scan of the ownership ranges for the owner of row */
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      /* w1[j]: total rows requested from proc j; w3[j]: no of ISs that touch proc j */
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp holds 2*msz ints so that the
     second half (tmp+msz) can later double as the rbuf2 receive area */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Carve tmp into per-destination send buffers of length w1[j] */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++; /* owner of global row */
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j; /* no of rows requested from proc j for IS i */
        sbuf1_j[2*k-1] = i;     /* the IS number */
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz; /* reuse the second half of tmp as receive space */
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    /* For each incoming request, count the nonzeros (diag + offdiag) in each
       requested row and reply with those counts on tag1 */
    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1; /* skip the header: 1 + 2*(no of IS) */
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  /* rbuf2[idex][0] is the total nonzero count sent back by the server; size
     the column-index (rbuf3) and value (rbuf4) receives accordingly */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols, in ascending
             global order: offdiag < cstart, then diag, then offdiag >= cend */
          cols  = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in the SAME order
             as the column indices were packed above */
          vals  = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices
     (skipped entirely when allcolumns, since columns are then kept as-is) */
  if (!allcolumns)
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr);
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        /* stored 1-based so that 0 can mean "column not in iscol[i]" */
        ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
    for (i=0; i<ismax; i++) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for (j=0; j<jmax; j++) {
        cmap_i[icol_i[j]] = j+1; /* 1-based; 0 means column not selected */
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    if (!allcolumns) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) { /* for each requested row of IS i */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) { /* only locally owned rows; remote rows are counted later */
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns){
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) { lens_i[j]++;}
          }
        } else { /* allcolumns: every column of the row is kept */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      } /* if (proc == rank) */
    } /* for (j=0; j<jmax; j++) */
  } /* for (i=0; i<ismax; i++) */

  /* Create row map: global row of C -> local row of submatrices */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        /* stored 1-based (the lookups below subtract 1 again) */
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j; /* 0-based, unlike the CTABLE variant */
    }
  }
#endif

  /* Update lens from offproc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns){cmap_i  = cmap[is_no];}
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns){
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                lens_i[row]++;
              }
            } else { /* allcolumns; NOTE(review): could be hoisted to lens_i[row] += max2 outside this loop */
              lens_i[row]++;
            }
          } /* for (l=0; l<max2; l++,ct2++) */
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      if (!allcolumns) cmap_i = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns){
            for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol){
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k] ; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns){cmap_i    = cmap[is_no];}
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          if (!allcolumns){
            for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* global col index! */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    if (!allcolumns){ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);}
    ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);
  }
#else
  if (!allcolumns){ierr = PetscFree(cmap[0]);CHKERRQ(ierr);}
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  if (!allcolumns){ierr = PetscFree(cmap);CHKERRQ(ierr);}
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1491 
1492 /*
1493  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1494  Be careful not to destroy them elsewhere.
1495  */
1496 #undef __FUNCT__
1497 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1498 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1499 {
1500   /* If making this function public, change the error returned in this function away from _PLIB. */
1501   PetscErrorCode ierr;
1502   Mat_MPIAIJ *aij;
1503   PetscBool seqaij;
1504 
1505   PetscFunctionBegin;
1506   /* Check to make sure the component matrices are compatible with C. */
1507   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1508   if(!seqaij) {
1509     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1510   }
1511   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1512   if(!seqaij) {
1513     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1514   }
1515   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1516     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1517   }
1518   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1519   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1520   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1521   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1522   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1523   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1524   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1525     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1526   }
1527   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1528   aij = (Mat_MPIAIJ*)((*C)->data);
1529   aij->A = A;
1530   aij->B = B;
1531   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1532   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1533   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1534   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1535   PetscFunctionReturn(0);
1536 }
1537 
1538 #undef __FUNCT__
1539 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1540 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1541 {
1542   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1543   PetscFunctionBegin;
1544   PetscValidPointer(A,2);
1545   PetscValidPointer(B,3);
1546   *A = aij->A;
1547   *B = aij->B;
1548   /* Note that we don't incref *A and *B, so be careful! */
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   MatGetSubMatricesParallel_MPIXAIJ - type-independent driver that extracts
   submatrices of C for the given (isrow[i], iscol[i]) pairs, producing a
   parallel submatrix for each pair whose communicator has size > 1 and a
   sequential submatrix otherwise.  The matrix-type-specific work is supplied
   through three callbacks:

     getsubmats_seq - extracts sequential submatrices for (row IS, col IS)
                      pairs (allocates the output array when scall is
                      MAT_INITIAL_MATRIX, per the comment block below)
     makefromseq    - assembles a parallel matrix on a communicator from a
                      sequential diag (A) / off-diag (B) pair
     extractseq     - hands back the sequential diag/off-diag pair stored
                      inside an existing parallel matrix (used for reuse)

   scall selects between creating new matrices (MAT_INITIAL_MATRIX) and
   refilling previously returned ones (MAT_REUSE_MATRIX).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt size, flag;
  PetscInt    i,ii;
  PetscInt    ismax_c;   /* number of parallel (comm size > 1) index-set pairs */
  PetscFunctionBegin;
  /* Nothing requested: nothing to do. */
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate that each row/col IS pair shares a communicator, and count the
     parallel pairs in ismax_c. */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else { /* if(ismax_c) */
    Mat *A,*B;              /* A: diag pieces (one per IS pair); B: off-diag pieces (parallel pairs only) */
    IS  *isrow_c, *iscol_c; /* row aliases and column complements for the parallel pairs */
    PetscMPIInt size;       /* NOTE(review): shadows the outer 'size' declared above — harmless but worth cleaning up */
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

     Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
     will deposite the extracted diag and off-diag parts.
     However, if reuse is taking place, we have to allocate the seq matrix arrays here.
     If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /* Borrow the diag/off-diag pieces out of the existing parallel submatrix. */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        }
        else {
          /* Sequential pair: the existing submatrix IS the diag piece. */
          A[i] = (*submat)[i];
        }
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
	isrow_c[ii] = isrow[i];
	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
	++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      /* NOTE(review): ISGetNonlocalIS is usually paired with ISRestoreNonlocalIS;
         ISDestroy here appears to drop the reference instead — confirm this is intended. */
      ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
     If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
     have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or the
     A is put into the correct submat slot (if size == 1).
     */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
           For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
           */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Free only the holder arrays; the Mat objects now live inside *submat. */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */
1669 
1670 
1671 
1672 #undef __FUNCT__
1673 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1674 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1675 {
1676   PetscErrorCode ierr;
1677   PetscFunctionBegin;
1678   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1679                                            MatCreateMPIAIJFromSeqMatrices_Private,
1680                                            MatMPIAIJExtractSeqMatrices_Private);
1681   CHKERRQ(ierr);
1682   PetscFunctionReturn(0);
1683 }
1684