xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision fcd14d6b088c23ca4e3d4b3f56f7c2bfda36916c)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
51   nrqr - no of requests recieved (which have to be or which have been processed
52 */
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
56 {
57   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
58   PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
59   const PetscInt **idx,*idx_i;
60   PetscInt       *n,**data,len;
61   PetscErrorCode ierr;
62   PetscMPIInt    size,rank,tag1,tag2;
63   PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
64   PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
65   PetscBT        *table;
66   MPI_Comm       comm;
67   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
68   MPI_Status     *s_status,*recv_status;
69   char           *t_p;
70 
71   PetscFunctionBegin;
72   comm   = ((PetscObject)C)->comm;
73   size   = c->size;
74   rank   = c->rank;
75   M      = C->rmap->N;
76 
77   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
78   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
79 
80   ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);
81 
82   for (i=0; i<imax; i++) {
83     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
84     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
85   }
86 
87   /* evaluate communication - mesg to who,length of mesg, and buffer space
88      required. Based on this, buffers are allocated, and data copied into them*/
89   ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
90   ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
91   ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
92   ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
93   for (i=0; i<imax; i++) {
94     ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
95     idx_i = idx[i];
96     len   = n[i];
97     for (j=0; j<len; j++) {
98       row  = idx_i[j];
99       if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
100       ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
101       w4[proc]++;
102     }
103     for (j=0; j<size; j++){
104       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
105     }
106   }
107 
108   nrqs     = 0;              /* no of outgoing messages */
109   msz      = 0;              /* total mesg length (for all proc */
110   w1[rank] = 0;              /* no mesg sent to intself */
111   w3[rank] = 0;
112   for (i=0; i<size; i++) {
113     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
114   }
115   /* pa - is list of processors to communicate with */
116   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
117   for (i=0,j=0; i<size; i++) {
118     if (w1[i]) {pa[j] = i; j++;}
119   }
120 
121   /* Each message would have a header = 1 + 2*(no of IS) + data */
122   for (i=0; i<nrqs; i++) {
123     j      = pa[i];
124     w1[j] += w2[j] + 2*w3[j];
125     msz   += w1[j];
126   }
127 
128   /* Determine the number of messages to expect, their lengths, from from-ids */
129   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
130   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
131 
132   /* Now post the Irecvs corresponding to these messages */
133   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
134 
135   /* Allocate Memory for outgoing messages */
136   ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
137   ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
138   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
139 
140   {
141     PetscInt *iptr = tmp,ict  = 0;
142     for (i=0; i<nrqs; i++) {
143       j         = pa[i];
144       iptr     +=  ict;
145       outdat[j] = iptr;
146       ict       = w1[j];
147     }
148   }
149 
150   /* Form the outgoing messages */
151   /*plug in the headers*/
152   for (i=0; i<nrqs; i++) {
153     j            = pa[i];
154     outdat[j][0] = 0;
155     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
156     ptr[j]       = outdat[j] + 2*w3[j] + 1;
157   }
158 
159   /* Memory for doing local proc's work*/
160   {
161     ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
162                         M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
163     ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
164     ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
165     ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
166     ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
167     ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);
168 
169     for (i=0; i<imax; i++) {
170       table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
171       data[i]  = d_p + M*i;
172     }
173   }
174 
175   /* Parse the IS and update local tables and the outgoing buf with the data*/
176   {
177     PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
178     PetscBT  table_i;
179 
180     for (i=0; i<imax; i++) {
181       ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
182       n_i     = n[i];
183       table_i = table[i];
184       idx_i   = idx[i];
185       data_i  = data[i];
186       isz_i   = isz[i];
187       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
188         row  = idx_i[j];
189         ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
190         if (proc != rank) { /* copy to the outgoing buffer */
191           ctr[proc]++;
192           *ptr[proc] = row;
193           ptr[proc]++;
194         } else { /* Update the local table */
195           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
196         }
197       }
198       /* Update the headers for the current IS */
199       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
200         if ((ctr_j = ctr[j])) {
201           outdat_j        = outdat[j];
202           k               = ++outdat_j[0];
203           outdat_j[2*k]   = ctr_j;
204           outdat_j[2*k-1] = i;
205         }
206       }
207       isz[i] = isz_i;
208     }
209   }
210 
211   /*  Now  post the sends */
212   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
213   for (i=0; i<nrqs; ++i) {
214     j    = pa[i];
215     ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
216   }
217 
218   /* No longer need the original indices*/
219   for (i=0; i<imax; ++i) {
220     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
221   }
222   ierr = PetscFree2(idx,n);CHKERRQ(ierr);
223 
224   for (i=0; i<imax; ++i) {
225     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
226   }
227 
228   /* Do Local work*/
229   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
230 
231   /* Receive messages*/
232   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
233   if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
234 
235   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
236   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
237 
238   /* Phase 1 sends are complete - deallocate buffers */
239   ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
240   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
241 
242   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
243   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
244   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
245   ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
246   ierr = PetscFree(rbuf);CHKERRQ(ierr);
247 
248 
249  /* Send the data back*/
250   /* Do a global reduction to know the buffer space req for incoming messages*/
251   {
252     PetscMPIInt *rw1;
253 
254     ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
255     ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
256 
257     for (i=0; i<nrqr; ++i) {
258       proc      = recv_status[i].MPI_SOURCE;
259       if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
260       rw1[proc] = isz1[i];
261     }
262     ierr = PetscFree(onodes1);CHKERRQ(ierr);
263     ierr = PetscFree(olengths1);CHKERRQ(ierr);
264 
265     /* Determine the number of messages to expect, their lengths, from from-ids */
266     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
267     ierr = PetscFree(rw1);CHKERRQ(ierr);
268   }
269   /* Now post the Irecvs corresponding to these messages */
270   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
271 
272   /*  Now  post the sends */
273   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
274   for (i=0; i<nrqr; ++i) {
275     j    = recv_status[i].MPI_SOURCE;
276     ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
277   }
278 
279   /* receive work done on other processors*/
280   {
281     PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
282     PetscMPIInt idex;
283     PetscBT     table_i;
284     MPI_Status  *status2;
285 
286     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
287     for (i=0; i<nrqs; ++i) {
288       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
289       /* Process the message*/
290       rbuf2_i = rbuf2[idex];
291       ct1     = 2*rbuf2_i[0]+1;
292       jmax    = rbuf2[idex][0];
293       for (j=1; j<=jmax; j++) {
294         max     = rbuf2_i[2*j];
295         is_no   = rbuf2_i[2*j-1];
296         isz_i   = isz[is_no];
297         data_i  = data[is_no];
298         table_i = table[is_no];
299         for (k=0; k<max; k++,ct1++) {
300           row = rbuf2_i[ct1];
301           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
302         }
303         isz[is_no] = isz_i;
304       }
305     }
306 
307     if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
308     ierr = PetscFree(status2);CHKERRQ(ierr);
309   }
310 
311   for (i=0; i<imax; ++i) {
312     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
313   }
314 
315   ierr = PetscFree(onodes2);CHKERRQ(ierr);
316   ierr = PetscFree(olengths2);CHKERRQ(ierr);
317 
318   ierr = PetscFree(pa);CHKERRQ(ierr);
319   ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
320   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
321   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
322   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
323   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
324   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
325   ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
326   ierr = PetscFree(s_status);CHKERRQ(ierr);
327   ierr = PetscFree(recv_status);CHKERRQ(ierr);
328   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
329   ierr = PetscFree(xdata);CHKERRQ(ierr);
330   ierr = PetscFree(isz1);CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
395 /*
396       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
397          and return the output
398 
399          Input:
400            C    - the matrix
401            nrqr - no of messages being processed.
402            rbuf - an array of pointers to the recieved requests
403 
404          Output:
405            xdata - array of messages to be sent back
406            isz1  - size of each message
407 
408   For better efficiency perhaps we should malloc separately each xdata[i],
409 then if a remalloc is required we need only copy the data for that one row
410 rather then all previous rows as it is now where a single large chunck of
411 memory is used.
412 
413 */
414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
415 {
416   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
417   Mat            A = c->A,B = c->B;
418   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
419   PetscErrorCode ierr;
420   PetscMPIInt    rank;
421   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
422   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
423   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
424   PetscInt       *rbuf_i,kmax,rbuf_0;
425   PetscBT        xtable;
426 
427   PetscFunctionBegin;
428   rank   = c->rank;
429   m      = C->rmap->N;
430   rstart = C->rmap->rstart;
431   cstart = C->cmap->rstart;
432   ai     = a->i;
433   aj     = a->j;
434   bi     = b->i;
435   bj     = b->j;
436   garray = c->garray;
437 
438 
439   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
440     rbuf_i  =  rbuf[i];
441     rbuf_0  =  rbuf_i[0];
442     ct     += rbuf_0;
443     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
444   }
445 
446   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
447   else      max1 = 1;
448   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
449   ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
450   ++no_malloc;
451   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
452   ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
453 
454   ct3 = 0;
455   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
456     rbuf_i =  rbuf[i];
457     rbuf_0 =  rbuf_i[0];
458     ct1    =  2*rbuf_0+1;
459     ct2    =  ct1;
460     ct3    += ct1;
461     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
462       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
463       oct2 = ct2;
464       kmax = rbuf_i[2*j];
465       for (k=0; k<kmax; k++,ct1++) {
466         row = rbuf_i[ct1];
467         if (!PetscBTLookupSet(xtable,row)) {
468           if (!(ct3 < mem_estimate)) {
469             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
470             ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
471             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
472             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
473             xdata[0]     = tmp;
474             mem_estimate = new_estimate; ++no_malloc;
475             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
476           }
477           xdata[i][ct2++] = row;
478           ct3++;
479         }
480       }
481       for (k=oct2,max2=ct2; k<max2; k++) {
482         row   = xdata[i][k] - rstart;
483         start = ai[row];
484         end   = ai[row+1];
485         for (l=start; l<end; l++) {
486           val = aj[l] + cstart;
487           if (!PetscBTLookupSet(xtable,val)) {
488             if (!(ct3 < mem_estimate)) {
489               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
490               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
491               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
492               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
493               xdata[0]     = tmp;
494               mem_estimate = new_estimate; ++no_malloc;
495               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
496             }
497             xdata[i][ct2++] = val;
498             ct3++;
499           }
500         }
501         start = bi[row];
502         end   = bi[row+1];
503         for (l=start; l<end; l++) {
504           val = garray[bj[l]];
505           if (!PetscBTLookupSet(xtable,val)) {
506             if (!(ct3 < mem_estimate)) {
507               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
508               ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
509               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
510               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
511               xdata[0]     = tmp;
512               mem_estimate = new_estimate; ++no_malloc;
513               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
514             }
515             xdata[i][ct2++] = val;
516             ct3++;
517           }
518         }
519       }
520       /* Update the header*/
521       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
522       xdata[i][2*j-1] = rbuf_i[2*j-1];
523     }
524     xdata[i][0] = rbuf_0;
525     xdata[i+1]  = xdata[i] + ct2;
526     isz1[i]     = ct2; /* size of each message */
527   }
528   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
529   ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
530   PetscFunctionReturn(0);
531 }
532 /* -------------------------------------------------------------------------*/
533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool,Mat*);
534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
535 /*
536     Every processor gets the entire matrix
537 */
538 #undef __FUNCT__
539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
540 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
541 {
542   Mat            B;
543   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
544   Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
545   PetscErrorCode ierr;
546   PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
547   PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
548   PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
549   MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
550 
551   PetscFunctionBegin;
552   ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
553   ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);
554 
555   if (scall == MAT_INITIAL_MATRIX) {
556     /* ----------------------------------------------------------------
557          Tell every processor the number of nonzeros per row
558     */
559     ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
560     for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
561       lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
562     }
563     sendcount = A->rmap->rend - A->rmap->rstart;
564     ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
565     for (i=0; i<size; i++) {
566       recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
567       displs[i]     = A->rmap->range[i];
568     }
569 #if defined(PETSC_HAVE_MPI_IN_PLACE)
570     ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
571 #else
572     ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
573 #endif
574     /* ---------------------------------------------------------------
575          Create the sequential matrix of the same type as the local block diagonal
576     */
577     ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
578     ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
579     ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
580     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
581     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
582     **Bin = B;
583     b = (Mat_SeqAIJ *)B->data;
584 
585     /*--------------------------------------------------------------------
586        Copy my part of matrix column indices over
587     */
588     sendcount  = ad->nz + bd->nz;
589     jsendbuf   = b->j + b->i[rstarts[rank]];
590     a_jsendbuf = ad->j;
591     b_jsendbuf = bd->j;
592     n          = A->rmap->rend - A->rmap->rstart;
593     cnt        = 0;
594     for (i=0; i<n; i++) {
595 
596       /* put in lower diagonal portion */
597       m = bd->i[i+1] - bd->i[i];
598       while (m > 0) {
599         /* is it above diagonal (in bd (compressed) numbering) */
600         if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
601         jsendbuf[cnt++] = garray[*b_jsendbuf++];
602         m--;
603       }
604 
605       /* put in diagonal portion */
606       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
607         jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
608       }
609 
610       /* put in upper diagonal portion */
611       while (m-- > 0) {
612         jsendbuf[cnt++] = garray[*b_jsendbuf++];
613       }
614     }
615     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
616 
617     /*--------------------------------------------------------------------
618        Gather all column indices to all processors
619     */
620     for (i=0; i<size; i++) {
621       recvcounts[i] = 0;
622       for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
623         recvcounts[i] += lens[j];
624       }
625     }
626     displs[0]  = 0;
627     for (i=1; i<size; i++) {
628       displs[i] = displs[i-1] + recvcounts[i-1];
629     }
630 #if defined(PETSC_HAVE_MPI_IN_PLACE)
631     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
632 #else
633     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
634 #endif
635     /*--------------------------------------------------------------------
636         Assemble the matrix into useable form (note numerical values not yet set)
637     */
638     /* set the b->ilen (length of each row) values */
639     ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
640     /* set the b->i indices */
641     b->i[0] = 0;
642     for (i=1; i<=A->rmap->N; i++) {
643       b->i[i] = b->i[i-1] + lens[i-1];
644     }
645     ierr = PetscFree(lens);CHKERRQ(ierr);
646     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
647     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
648 
649   } else {
650     B  = **Bin;
651     b = (Mat_SeqAIJ *)B->data;
652   }
653 
654   /*--------------------------------------------------------------------
655        Copy my part of matrix numerical values into the values location
656   */
657   if (flag == MAT_GET_VALUES){
658     sendcount = ad->nz + bd->nz;
659     sendbuf   = b->a + b->i[rstarts[rank]];
660     a_sendbuf = ad->a;
661     b_sendbuf = bd->a;
662     b_sendj   = bd->j;
663     n         = A->rmap->rend - A->rmap->rstart;
664     cnt       = 0;
665     for (i=0; i<n; i++) {
666 
667       /* put in lower diagonal portion */
668       m = bd->i[i+1] - bd->i[i];
669       while (m > 0) {
670         /* is it above diagonal (in bd (compressed) numbering) */
671         if (garray[*b_sendj] > A->rmap->rstart + i) break;
672         sendbuf[cnt++] = *b_sendbuf++;
673         m--;
674         b_sendj++;
675       }
676 
677       /* put in diagonal portion */
678       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
679         sendbuf[cnt++] = *a_sendbuf++;
680       }
681 
682       /* put in upper diagonal portion */
683       while (m-- > 0) {
684         sendbuf[cnt++] = *b_sendbuf++;
685         b_sendj++;
686       }
687     }
688     if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
689 
690     /* -----------------------------------------------------------------
691        Gather all numerical values to all processors
692     */
693     if (!recvcounts) {
694       ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
695     }
696     for (i=0; i<size; i++) {
697       recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
698     }
699     displs[0]  = 0;
700     for (i=1; i<size; i++) {
701       displs[i] = displs[i-1] + recvcounts[i-1];
702     }
703     recvbuf   = b->a;
704 #if defined(PETSC_HAVE_MPI_IN_PLACE)
705     ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
706 #else
707     ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
708 #endif
709   }  /* endof (flag == MAT_GET_VALUES) */
710   ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);
711 
712   if (A->symmetric){
713     ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
714   } else if (A->hermitian) {
715     ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
716   } else if (A->structurally_symmetric) {
717     ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
718   }
719   PetscFunctionReturn(0);
720 }
721 
722 
723 
724 #undef __FUNCT__
725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
727 {
728   PetscErrorCode ierr;
729   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
730   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,wantallmatrixcolumns=PETSC_FALSE;
731 
732   PetscFunctionBegin;
733   /*
734        Check for special case: each processor gets entire matrix
735   */
736   if (ismax == 1 && C->rmap->N == C->cmap->N) {
737     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
738     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
739     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
740     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
741     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
742       wantallmatrix = PETSC_TRUE;
743       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
744     }
745   }
746   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
747   if (twantallmatrix) {
748     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
749     PetscFunctionReturn(0);
750   }
751 
752   /* Allocate memory to hold all the submatrices */
753   if (scall != MAT_REUSE_MATRIX) {
754     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
755   }
756 
757   /* Check for special case: each processor gets entire matrix columns */
758   if (ismax == 1){
759     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
760     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
761     if (colflag && ncol == C->cmap->N){
762       wantallmatrixcolumns = PETSC_TRUE;
763       ierr = MatGetSubMatrices_MPIAIJ_Local(C,ismax,isrow,iscol,scall,wantallmatrixcolumns,*submat);CHKERRQ(ierr);
764       PetscFunctionReturn(0);
765     }
766   }
767 
768   /* Determine the number of stages through which submatrices are done */
769   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
770   /*
771      Each stage will extract nmax submatrices.
772      nmax is determined by the matrix column dimension.
773      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
774   */
775   if (!nmax) nmax = 1;
776   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
777 
778   /* Make sure every processor loops through the nstages */
779   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
780 
781   for (i=0,pos=0; i<nstages; i++) {
782     if (pos+nmax <= ismax) max_no = nmax;
783     else if (pos == ismax) max_no = 0;
784     else                   max_no = ismax-pos;
785     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,wantallmatrixcolumns,*submat+pos);CHKERRQ(ierr);
786     pos += max_no;
787   }
788   PetscFunctionReturn(0);
789 }
790 
791 /* -------------------------------------------------------------------------*/
792 #undef __FUNCT__
793 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
794 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
795 {
796   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
797   Mat            A = c->A;
798   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
799   const PetscInt **icol,**irow;
800   PetscInt       *nrow,*ncol,start;
801   PetscErrorCode ierr;
802   PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
803   PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
804   PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
805   PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
806   PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
807 #if defined (PETSC_USE_CTABLE)
808   PetscTable     *cmap,cmap_i,*rmap,rmap_i;
809 #else
810   PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
811 #endif
812   const PetscInt *irow_i;
813   PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
814   MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
815   MPI_Request    *r_waits4,*s_waits3,*s_waits4;
816   MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
817   MPI_Status     *r_status3,*r_status4,*s_status4;
818   MPI_Comm       comm;
819   PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
820   PetscMPIInt    *onodes1,*olengths1;
821   PetscMPIInt    idex,idex2,end;
822 
823   PetscFunctionBegin;
824   comm   = ((PetscObject)C)->comm;
825   tag0   = ((PetscObject)C)->tag;
826   size   = c->size;
827   rank   = c->rank;
828 
829   /* Get some new tags to keep the communication clean */
830   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
831   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
832   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
833 
834   ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);
835 
836   for (i=0; i<ismax; i++) {
837     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
838     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
839     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
840     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
841   }
842 
843   /* evaluate communication - mesg to who, length of mesg, and buffer space
844      required. Based on this, buffers are allocated, and data copied into them*/
845   ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
846   ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
847   ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
848   ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
849   for (i=0; i<ismax; i++) {
850     ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
851     jmax   = nrow[i];
852     irow_i = irow[i];
853     for (j=0; j<jmax; j++) {
854       l = 0;
855       row  = irow_i[j];
856       while (row >= C->rmap->range[l+1]) l++;
857       proc = l;
858       w4[proc]++;
859     }
860     for (j=0; j<size; j++) {
861       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
862     }
863   }
864 
865   nrqs     = 0;              /* no of outgoing messages */
866   msz      = 0;              /* total mesg length (for all procs) */
867   w1[rank] = 0;              /* no mesg sent to self */
868   w3[rank] = 0;
869   for (i=0; i<size; i++) {
870     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
871   }
872   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
873   for (i=0,j=0; i<size; i++) {
874     if (w1[i]) { pa[j] = i; j++; }
875   }
876 
877   /* Each message would have a header = 1 + 2*(no of IS) + data */
878   for (i=0; i<nrqs; i++) {
879     j     = pa[i];
880     w1[j] += w2[j] + 2* w3[j];
881     msz   += w1[j];
882   }
883   ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
884 
885   /* Determine the number of messages to expect, their lengths, from from-ids */
886   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
887   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
888 
889   /* Now post the Irecvs corresponding to these messages */
890   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
891 
892   ierr = PetscFree(onodes1);CHKERRQ(ierr);
893   ierr = PetscFree(olengths1);CHKERRQ(ierr);
894 
895   /* Allocate Memory for outgoing messages */
896   ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
897   ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
898   ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);
899 
900   {
901     PetscInt *iptr = tmp,ict = 0;
902     for (i=0; i<nrqs; i++) {
903       j         = pa[i];
904       iptr     += ict;
905       sbuf1[j]  = iptr;
906       ict       = w1[j];
907     }
908   }
909 
910   /* Form the outgoing messages */
911   /* Initialize the header space */
912   for (i=0; i<nrqs; i++) {
913     j           = pa[i];
914     sbuf1[j][0] = 0;
915     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
916     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
917   }
918 
919   /* Parse the isrow and copy data into outbuf */
920   for (i=0; i<ismax; i++) {
921     ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
922     irow_i = irow[i];
923     jmax   = nrow[i];
924     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
925       l = 0;
926       row  = irow_i[j];
927       while (row >= C->rmap->range[l+1]) l++;
928       proc = l;
929       if (proc != rank) { /* copy to the outgoing buf*/
930         ctr[proc]++;
931         *ptr[proc] = row;
932         ptr[proc]++;
933       }
934     }
935     /* Update the headers for the current IS */
936     for (j=0; j<size; j++) { /* Can Optimise this loop too */
937       if ((ctr_j = ctr[j])) {
938         sbuf1_j        = sbuf1[j];
939         k              = ++sbuf1_j[0];
940         sbuf1_j[2*k]   = ctr_j;
941         sbuf1_j[2*k-1] = i;
942       }
943     }
944   }
945 
946   /*  Now  post the sends */
947   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
948   for (i=0; i<nrqs; ++i) {
949     j    = pa[i];
950     ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
951   }
952 
953   /* Post Receives to capture the buffer size */
954   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
955   ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
956   rbuf2[0] = tmp + msz;
957   for (i=1; i<nrqs; ++i) {
958     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
959   }
960   for (i=0; i<nrqs; ++i) {
961     j    = pa[i];
962     ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
963   }
964 
965   /* Send to other procs the buf size they should allocate */
966 
967 
968   /* Receive messages*/
969   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
970   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
971   ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
972   {
973     Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
974     PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
975     PetscInt    *sbuf2_i;
976 
977     for (i=0; i<nrqr; ++i) {
978       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
979       req_size[idex] = 0;
980       rbuf1_i         = rbuf1[idex];
981       start           = 2*rbuf1_i[0] + 1;
982       ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
983       ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
984       sbuf2_i         = sbuf2[idex];
985       for (j=start; j<end; j++) {
986         id               = rbuf1_i[j] - rstart;
987         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
988         sbuf2_i[j]       = ncols;
989         req_size[idex] += ncols;
990       }
991       req_source[idex] = r_status1[i].MPI_SOURCE;
992       /* form the header */
993       sbuf2_i[0]   = req_size[idex];
994       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
995       ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
996     }
997   }
998   ierr = PetscFree(r_status1);CHKERRQ(ierr);
999   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
1000 
1001   /*  recv buffer sizes */
1002   /* Receive messages*/
1003 
1004   ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
1005   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
1006   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
1007   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
1008   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
1009 
1010   for (i=0; i<nrqs; ++i) {
1011     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1012     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
1013     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
1014     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1015     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1016   }
1017   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1018   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1019 
1020   /* Wait on sends1 and sends2 */
1021   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1022   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1023 
1024   if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
1025   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
1026   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1027   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1028   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1029   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1030 
1031   /* Now allocate buffers for a->j, and send them off */
1032   ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
1033   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1034   ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
1035   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1036 
1037   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1038   {
1039     PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1040     PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1041     PetscInt cend = C->cmap->rend;
1042     PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1043 
1044     for (i=0; i<nrqr; i++) {
1045       rbuf1_i   = rbuf1[i];
1046       sbuf_aj_i = sbuf_aj[i];
1047       ct1       = 2*rbuf1_i[0] + 1;
1048       ct2       = 0;
1049       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1050         kmax = rbuf1[i][2*j];
1051         for (k=0; k<kmax; k++,ct1++) {
1052           row    = rbuf1_i[ct1] - rstart;
1053           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1054           ncols  = nzA + nzB;
1055           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1056 
1057           /* load the column indices for this row into cols*/
1058           cols  = sbuf_aj_i + ct2;
1059 
1060 	  lwrite = 0;
1061           for (l=0; l<nzB; l++) {
1062             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
1063           }
1064           for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
1065           for (l=0; l<nzB; l++) {
1066             if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
1067           }
1068 
1069           ct2 += ncols;
1070         }
1071       }
1072       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1073     }
1074   }
1075   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1076   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1077 
1078   /* Allocate buffers for a->a, and send them off */
1079   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1080   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1081   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1082   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1083 
1084   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1085   {
1086     PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1087     PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
1088     PetscInt    cend = C->cmap->rend;
1089     PetscInt    *b_j = b->j;
1090     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1091 
1092     for (i=0; i<nrqr; i++) {
1093       rbuf1_i   = rbuf1[i];
1094       sbuf_aa_i = sbuf_aa[i];
1095       ct1       = 2*rbuf1_i[0]+1;
1096       ct2       = 0;
1097       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1098         kmax = rbuf1_i[2*j];
1099         for (k=0; k<kmax; k++,ct1++) {
1100           row    = rbuf1_i[ct1] - rstart;
1101           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1102           ncols  = nzA + nzB;
1103           cworkB = b_j + b_i[row];
1104           vworkA = a_a + a_i[row];
1105           vworkB = b_a + b_i[row];
1106 
1107           /* load the column values for this row into vals*/
1108           vals  = sbuf_aa_i+ct2;
1109 
1110 	  lwrite = 0;
1111           for (l=0; l<nzB; l++) {
1112             if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
1113           }
1114           for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
1115           for (l=0; l<nzB; l++) {
1116             if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
1117           }
1118 
1119           ct2 += ncols;
1120         }
1121       }
1122       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1123     }
1124   }
1125   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1126   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1127   ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
1128   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1129 
1130   /* Form the matrix */
1131   /* create col map: global col of C -> local col of submatrices */
1132   if (!allcolumns){
1133     const PetscInt *icol_i;
1134 #if defined (PETSC_USE_CTABLE)
1135     ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
1136     for (i=0; i<ismax; i++) {
1137       ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr);
1138       jmax   = ncol[i];
1139       icol_i = icol[i];
1140       cmap_i = cmap[i];
1141       for (j=0; j<jmax; j++) {
1142         ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr);
1143       }
1144     }
1145 #else
1146     ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
1147     ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr);
1148     ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1149     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; }
1150     for (i=0; i<ismax; i++) {
1151       jmax   = ncol[i];
1152       icol_i = icol[i];
1153       cmap_i = cmap[i];
1154       for (j=0; j<jmax; j++) {
1155         cmap_i[icol_i[j]] = j+1;
1156       }
1157     }
1158 #endif
1159   }
1160 
1161   /* Create lens which is required for MatCreate... */
1162   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1163   ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
1164   ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
1165   ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
1166   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1167 
1168   /* Update lens from local data */
1169   for (i=0; i<ismax; i++) {
1170     jmax   = nrow[i];
1171     if (!allcolumns) cmap_i = cmap[i];
1172     irow_i = irow[i];
1173     lens_i = lens[i];
1174     for (j=0; j<jmax; j++) {
1175       l = 0;
1176       row  = irow_i[j];
1177       while (row >= C->rmap->range[l+1]) l++;
1178       proc = l;
1179       if (proc == rank) {
1180         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1181         if (!allcolumns){
1182           for (k=0; k<ncols; k++) {
1183 #if defined (PETSC_USE_CTABLE)
1184             ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1185 #else
1186             tcol = cmap_i[cols[k]];
1187 #endif
1188             if (tcol) { lens_i[j]++;}
1189           }
1190         } else { /* allcolumns */
1191           lens_i[j] = ncols;
1192         }
1193         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1194       }
1195     }
1196   }
1197 
1198   /* Create row map: global row of C -> local row of submatrices */
1199 #if defined (PETSC_USE_CTABLE)
1200   ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
1201     for (i=0; i<ismax; i++) {
1202       ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr);
1203       rmap_i = rmap[i];
1204       irow_i = irow[i];
1205       jmax   = nrow[i];
1206       for (j=0; j<jmax; j++) {
1207         ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr);
1208       }
1209     }
1210 #else
1211   ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
1212   ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
1213   ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
1214   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
1215   for (i=0; i<ismax; i++) {
1216     rmap_i = rmap[i];
1217     irow_i = irow[i];
1218     jmax   = nrow[i];
1219     for (j=0; j<jmax; j++) {
1220       rmap_i[irow_i[j]] = j;
1221     }
1222   }
1223 #endif
1224 
1225   /* Update lens from offproc data */
1226   {
1227     PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1228 
1229     for (tmp2=0; tmp2<nrqs; tmp2++) {
1230       ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
1231       idex   = pa[idex2];
1232       sbuf1_i = sbuf1[idex];
1233       jmax    = sbuf1_i[0];
1234       ct1     = 2*jmax+1;
1235       ct2     = 0;
1236       rbuf2_i = rbuf2[idex2];
1237       rbuf3_i = rbuf3[idex2];
1238       for (j=1; j<=jmax; j++) {
1239         is_no   = sbuf1_i[2*j-1];
1240         max1    = sbuf1_i[2*j];
1241         lens_i  = lens[is_no];
1242         if (!allcolumns){cmap_i  = cmap[is_no];}
1243         rmap_i  = rmap[is_no];
1244         for (k=0; k<max1; k++,ct1++) {
1245 #if defined (PETSC_USE_CTABLE)
1246           ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1247           row--;
1248           if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
1249 #else
1250           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1251 #endif
1252           max2 = rbuf2_i[ct1];
1253           for (l=0; l<max2; l++,ct2++) {
1254             if (!allcolumns){
1255 #if defined (PETSC_USE_CTABLE)
1256               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1257 #else
1258               tcol = cmap_i[rbuf3_i[ct2]];
1259 #endif
1260               if (tcol) {
1261                 lens_i[row]++;
1262               }
1263             } else { /* allcolumns */
1264               lens_i[row]++; /* lens_i[row] += max2 ? */
1265             }
1266           }
1267         }
1268       }
1269     }
1270   }
1271   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1272   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1273   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1274   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1275   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1276 
1277   /* Create the submatrices */
1278   if (scall == MAT_REUSE_MATRIX) {
1279     PetscBool  flag;
1280 
1281     /*
1282         Assumes new rows are same length as the old rows,hence bug!
1283     */
1284     for (i=0; i<ismax; i++) {
1285       mat = (Mat_SeqAIJ *)(submats[i]->data);
1286       if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1287       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
1288       if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1289       /* Initial matrix as if empty */
1290       ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1291       submats[i]->factortype = C->factortype;
1292     }
1293   } else {
1294     for (i=0; i<ismax; i++) {
1295       ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1296       ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1297       ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1298       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1299     }
1300   }
1301 
1302   /* Assemble the matrices */
1303   /* First assemble the local rows */
1304   {
1305     PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1306     PetscScalar *imat_a;
1307 
1308     for (i=0; i<ismax; i++) {
1309       mat       = (Mat_SeqAIJ*)submats[i]->data;
1310       imat_ilen = mat->ilen;
1311       imat_j    = mat->j;
1312       imat_i    = mat->i;
1313       imat_a    = mat->a;
1314       if (!allcolumns) cmap_i = cmap[i];
1315       rmap_i    = rmap[i];
1316       irow_i    = irow[i];
1317       jmax      = nrow[i];
1318       for (j=0; j<jmax; j++) {
1319 	l = 0;
1320         row      = irow_i[j];
1321         while (row >= C->rmap->range[l+1]) l++;
1322         proc = l;
1323         if (proc == rank) {
1324           old_row  = row;
1325 #if defined (PETSC_USE_CTABLE)
1326           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1327           row--;
1328 #else
1329           row      = rmap_i[row];
1330 #endif
1331           ilen_row = imat_ilen[row];
1332           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1333           mat_i    = imat_i[row] ;
1334           mat_a    = imat_a + mat_i;
1335           mat_j    = imat_j + mat_i;
1336           if (!allcolumns){
1337             for (k=0; k<ncols; k++) {
1338 #if defined (PETSC_USE_CTABLE)
1339               ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
1340 #else
1341               tcol = cmap_i[cols[k]];
1342 #endif
1343               if (tcol){
1344                 *mat_j++ = tcol - 1;
1345                 *mat_a++ = vals[k];
1346                 ilen_row++;
1347               }
1348             }
1349           } else { /* allcolumns */
1350             for (k=0; k<ncols; k++) {
1351               *mat_j++ = cols[k] ; /* global col index! */
1352               *mat_a++ = vals[k];
1353               ilen_row++;
1354             }
1355           }
1356           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1357           imat_ilen[row] = ilen_row;
1358         }
1359       }
1360     }
1361   }
1362 
1363   /*   Now assemble the off proc rows*/
1364   {
1365     PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1366     PetscInt    *imat_j,*imat_i;
1367     PetscScalar *imat_a,*rbuf4_i;
1368 
1369     for (tmp2=0; tmp2<nrqs; tmp2++) {
1370       ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
1371       idex   = pa[idex2];
1372       sbuf1_i = sbuf1[idex];
1373       jmax    = sbuf1_i[0];
1374       ct1     = 2*jmax + 1;
1375       ct2     = 0;
1376       rbuf2_i = rbuf2[idex2];
1377       rbuf3_i = rbuf3[idex2];
1378       rbuf4_i = rbuf4[idex2];
1379       for (j=1; j<=jmax; j++) {
1380         is_no     = sbuf1_i[2*j-1];
1381         rmap_i    = rmap[is_no];
1382         if (!allcolumns){cmap_i    = cmap[is_no];}
1383         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1384         imat_ilen = mat->ilen;
1385         imat_j    = mat->j;
1386         imat_i    = mat->i;
1387         imat_a    = mat->a;
1388         max1      = sbuf1_i[2*j];
1389         for (k=0; k<max1; k++,ct1++) {
1390           row   = sbuf1_i[ct1];
1391 #if defined (PETSC_USE_CTABLE)
1392           ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1393           row--;
1394 #else
1395           row   = rmap_i[row];
1396 #endif
1397           ilen  = imat_ilen[row];
1398           mat_i = imat_i[row] ;
1399           mat_a = imat_a + mat_i;
1400           mat_j = imat_j + mat_i;
1401           max2 = rbuf2_i[ct1];
1402           if (!allcolumns){
1403             for (l=0; l<max2; l++,ct2++) {
1404 
1405 #if defined (PETSC_USE_CTABLE)
1406               ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1407 #else
1408               tcol = cmap_i[rbuf3_i[ct2]];
1409 #endif
1410               if (tcol) {
1411                 *mat_j++ = tcol - 1;
1412                 *mat_a++ = rbuf4_i[ct2];
1413                 ilen++;
1414               }
1415             }
1416           } else { /* allcolumns */
1417             for (l=0; l<max2; l++,ct2++) {
1418               *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1419               *mat_a++ = rbuf4_i[ct2];
1420               ilen++;
1421             }
1422           }
1423           imat_ilen[row] = ilen;
1424         }
1425       }
1426     }
1427   }
1428   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1429   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1430   if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1431   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1432   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1433 
1434   /* Restore the indices */
1435   for (i=0; i<ismax; i++) {
1436     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1437     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1438   }
1439 
1440   /* Destroy allocated memory */
1441   ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
1442   ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
1443   ierr = PetscFree(pa);CHKERRQ(ierr);
1444 
1445   ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
1446   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1447   for (i=0; i<nrqr; ++i) {
1448     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1449   }
1450   for (i=0; i<nrqs; ++i) {
1451     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1452     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1453   }
1454 
1455   ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
1456   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1457   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1458   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1459   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1460   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1461   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1462 #if defined (PETSC_USE_CTABLE)
1463   for (i=0; i<ismax; i++) {
1464     if (!allcolumns){ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);}
1465     ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);
1466   }
1467 #else
1468   if (!allcolumns){ierr = PetscFree(cmap[0]);CHKERRQ(ierr);}
1469   ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
1470 #endif
1471   if (!allcolumns){ierr = PetscFree(cmap);CHKERRQ(ierr);}
1472   ierr = PetscFree(rmap);CHKERRQ(ierr);
1473   ierr = PetscFree(lens[0]);CHKERRQ(ierr);
1474   ierr = PetscFree(lens);CHKERRQ(ierr);
1475 
1476   for (i=0; i<ismax; i++) {
1477     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1478     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1479   }
1480   PetscFunctionReturn(0);
1481 }
1482 
1483 /*
1484  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1485  Be careful not to destroy them elsewhere.
1486  */
1487 #undef __FUNCT__
1488 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1489 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1490 {
1491   /* If making this function public, change the error returned in this function away from _PLIB. */
1492   PetscErrorCode ierr;
1493   Mat_MPIAIJ *aij;
1494   PetscBool seqaij;
1495 
1496   PetscFunctionBegin;
1497   /* Check to make sure the component matrices are compatible with C. */
1498   ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1499   if(!seqaij) {
1500     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1501   }
1502   ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1503   if(!seqaij) {
1504     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1505   }
1506   if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
1507     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1508   }
1509   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1510   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1511   ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
1512   ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
1513   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1514   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1515   if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
1516     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1517   }
1518   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1519   aij = (Mat_MPIAIJ*)((*C)->data);
1520   aij->A = A;
1521   aij->B = B;
1522   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1523   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1524   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1525   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1526   PetscFunctionReturn(0);
1527 }
1528 
1529 #undef __FUNCT__
1530 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1531 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1532 {
1533   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1534   PetscFunctionBegin;
1535   PetscValidPointer(A,2);
1536   PetscValidPointer(B,3);
1537   *A = aij->A;
1538   *B = aij->B;
1539   /* Note that we don't incref *A and *B, so be careful! */
1540   PetscFunctionReturn(0);
1541 }
1542 
1543 
1544 #undef __FUNCT__
1545 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1546 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1547 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1548 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1549 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1550 {
1551   PetscErrorCode ierr;
1552   PetscMPIInt size, flag;
1553   PetscInt    i,ii;
1554   PetscInt    ismax_c;
1555   PetscFunctionBegin;
1556   if(!ismax) {
1557     PetscFunctionReturn(0);
1558   }
1559   for(i = 0, ismax_c = 0; i < ismax; ++i) {
1560     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1561     if(flag != MPI_IDENT) {
1562       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1563     }
1564     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1565     if(size > 1) {
1566       ++ismax_c;
1567     }
1568   }
1569   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1570     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1571   }
1572   else { /* if(ismax_c) */
1573     Mat *A,*B;
1574     IS  *isrow_c, *iscol_c;
1575     PetscMPIInt size;
1576     /*
1577      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1578      array of sequential matrices underlying the resulting parallel matrices.
1579      Which arrays to allocate is based on the value of MatReuse scall.
1580      There are as many diag matrices as there are original index sets.
1581      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1582 
1583      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1584      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1585      will deposite the extracted diag and off-diag parts.
1586      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1587      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1588     */
1589 
1590     /* Parallel matrix array is allocated only if no reuse is taking place. */
1591     if (scall != MAT_REUSE_MATRIX) {
1592       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1593     }
1594     else {
1595       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1596       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1597       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1598       for(i = 0, ii = 0; i < ismax; ++i) {
1599         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1600         if(size > 1) {
1601           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1602           ++ii;
1603         }
1604         else {
1605           A[i] = (*submat)[i];
1606         }
1607       }
1608     }
1609     /*
1610      Construct the complements of the iscol ISs for parallel ISs only.
1611      These are used to extract the off-diag portion of the resulting parallel matrix.
1612      The row IS for the off-diag portion is the same as for the diag portion,
1613      so we merely alias the row IS, while skipping those that are sequential.
1614     */
1615     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1616     for(i = 0, ii = 0; i < ismax; ++i) {
1617       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1618       if(size > 1) {
1619 	isrow_c[ii] = isrow[i];
1620 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1621 	++ii;
1622       }
1623     }
1624     /* Now obtain the sequential A and B submatrices separately. */
1625     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1626     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1627     for(ii = 0; ii < ismax_c; ++ii) {
1628       ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
1629     }
1630     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1631     /*
1632      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1633      have been extracted directly into the parallel matrices containing them, or
1634      simply into the sequential matrix identical with the corresponding A (if size == 1).
1635      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1636      A is put into the correct submat slot (if size == 1).
1637      */
1638     if(scall != MAT_REUSE_MATRIX) {
1639       for(i = 0, ii = 0; i < ismax; ++i) {
1640         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1641         if(size > 1) {
1642           /*
1643            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1644            */
1645           /* Construct submat[i] from the Seq pieces A and B. */
1646           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1647 
1648           ++ii;
1649         }
1650         else {
1651           (*submat)[i] = A[i];
1652         }
1653       }
1654     }
1655     ierr = PetscFree(A); CHKERRQ(ierr);
1656     ierr = PetscFree(B); CHKERRQ(ierr);
1657   }
1658   PetscFunctionReturn(0);
1659 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1660 
1661 
1662 
1663 #undef __FUNCT__
1664 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1665 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1666 {
1667   PetscErrorCode ierr;
1668   PetscFunctionBegin;
1669   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
1670                                            MatCreateMPIAIJFromSeqMatrices_Private,
1671                                            MatMPIAIJExtractSeqMatrices_Private);
1672   CHKERRQ(ierr);
1673   PetscFunctionReturn(0);
1674 }
1675