xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision a790c00499656cdc34e95ec8f15f35cf5f4cdcbb)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (number of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Expands each index set in is[] by one level
   of overlap.  Indices owned by other processes are shipped to their owners
   (tag1), who expand them via MatIncreaseOverlap_MPIAIJ_Receive and ship the
   results back (tag2); locally-owned indices are expanded in place via
   MatIncreaseOverlap_MPIAIJ_Local.  On return each is[i] has been destroyed
   and recreated containing the enlarged index set.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  size   = c->size;
  rank   = c->rank;
  M      = C->rmap->N;    /* global number of rows */

  /* two fresh tags: tag1 for the request phase, tag2 for the reply phase */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total number of indices destined for proc p;
     w3[p] = number of index sets that contribute to proc p;
     w2[p] = 1 iff a message goes to proc p;
     w4    = per-IS scratch counts. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row  = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into per-destination segments of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers; ptr[j] tracks where the next data index goes,
     just past the (1 + 2*w3[j])-entry header */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: per index set, one bit table of M bits
     (for O(1) membership tests) and one index array of capacity M, all taken
     from two contiguous slabs t_p and d_p */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data:
     off-process rows go into the outgoing buffer for their owner, locally
     owned rows are recorded (deduplicated via the bit table) in data[i] */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS: record (IS number, count) pairs */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input index sets are destroyed here; they are recreated (enlarged) below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps with the communication in flight) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Expand the received requests against the local matrix; xdata/isz1 hold the replies */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


 /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      /* sanity check: the recorded sender must match the one we expected */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of reply messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge it into the local tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same (IS no, count) header layout as the request */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the index sets with the expanded contents */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
336 /*
337    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
338        the work on the local processor.
339 
340      Inputs:
341       C      - MAT_MPIAIJ;
342       imax - total no of index sets processed at a time;
343       table  - an array of char - size = m bits.
344 
345      Output:
346       isz    - array containing the count of the solution elements corresponding
347                to each index set;
348       data   - pointer to the solutions
349 */
350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
351 {
352   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
353   Mat        A = c->A,B = c->B;
354   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
355   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
356   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
357   PetscBT    table_i;
358 
359   PetscFunctionBegin;
360   rstart = C->rmap->rstart;
361   cstart = C->cmap->rstart;
362   ai     = a->i;
363   aj     = a->j;
364   bi     = b->i;
365   bj     = b->j;
366   garray = c->garray;
367 
368 
369   for (i=0; i<imax; i++) {
370     data_i  = data[i];
371     table_i = table[i];
372     isz_i   = isz[i];
373     for (j=0,max=isz[i]; j<max; j++) {
374       row   = data_i[j] - rstart;
375       start = ai[row];
376       end   = ai[row+1];
377       for (k=start; k<end; k++) { /* Amat */
378         val = aj[k] + cstart;
379         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
380       }
381       start = bi[row];
382       end   = bi[row+1];
383       for (k=start; k<end; k++) { /* Bmat */
384         val = garray[bj[k]];
385         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
386       }
387     }
388     isz[i] = isz_i;
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 #undef __FUNCT__
394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;    /* global number of rows; sizes the dedup bit table */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count the total number of requested indices across all incoming messages */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* estimate reply size from the average nonzeros per local row; all replies
     share one buffer anchored at xdata[0] and are regrown on demand */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else      max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct1 = read cursor in the request, ct2 = write cursor in the reply,
     ct3 = total entries written across all replies (vs mem_estimate) */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);  /* fresh dedup table per IS */
      oct2 = ct2;                                     /* reply start for this IS */
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase every xdata[ctr]
               pointer into the new allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each newly added row by its nonzero columns */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;   /* local row number */
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat (diagonal block) */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat (off-diagonal block) */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
530 /* -------------------------------------------------------------------------*/
531 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
532 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
533 /*
534     Every processor gets the entire matrix
535 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Builds on every process a sequential copy B of
   the whole parallel matrix A (structure via two MPI_Allgatherv of row lengths
   and column indices; values via a third when flag == MAT_GET_VALUES).
   With scall == MAT_REUSE_MATRIX only the numerical values are refreshed.
   *Bin is allocated here (MAT_INITIAL_MATRIX) and owns the single Mat.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr  = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr  = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, interleaving the
       off-diagonal (Bmat) entries below and above the diagonal (Amat) block
       so that each row's columns come out in ascending global order
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];   /* write directly into B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered; reuse the existing B */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location,
       using the same lower/diagonal/upper interleaving as the indices above
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr   = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf   = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
720 
721 
722 
723 #undef __FUNCT__
724 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
725 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
726 {
727   PetscErrorCode ierr;
728   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
729   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
730 
731   PetscFunctionBegin;
732   /*
733        Check for special case: each processor gets entire matrix
734   */
735   if (ismax == 1 && C->rmap->N == C->cmap->N) {
736     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
737     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
738     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
739     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
740     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
741       wantallmatrix = PETSC_TRUE;
742       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
743     }
744   }
745   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
746   if (twantallmatrix) {
747     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
748     PetscFunctionReturn(0);
749   }
750 
751   /* Allocate memory to hold all the submatrices */
752   if (scall != MAT_REUSE_MATRIX) {
753     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
754   }
755 
756   /* Check for special case: each processor gets entire matrix columns */
757   ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
758   for (i=0; i<ismax; i++) {
759     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
760     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
761     if (colflag && ncol == C->cmap->N){
762       allcolumns[i] = PETSC_TRUE;
763     } else {
764       allcolumns[i] = PETSC_FALSE;
765     }
766   }
767 
768   /* Determine the number of stages through which submatrices are done */
769   nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
770 
771   /*
772      Each stage will extract nmax submatrices.
773      nmax is determined by the matrix column dimension.
774      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
775   */
776   if (!nmax) nmax = 1;
777   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
778 
779   /* Make sure every processor loops through the nstages */
780   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);
781 
782   for (i=0,pos=0; i<nstages; i++) {
783     if (pos+nmax <= ismax) max_no = nmax;
784     else if (pos == ismax) max_no = 0;
785     else                   max_no = ismax-pos;
786     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
787     pos += max_no;
788   }
789 
790   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
791   PetscFunctionReturn(0);
792 }
793 
794 /* -------------------------------------------------------------------------*/
795 #undef __FUNCT__
796 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
  MatGetSubMatrices_MPIAIJ_Local - Builds, for each pair (isrow[i],iscol[i]), the sequential
  AIJ submatrix submats[i] on this process, fetching rows owned by other processes over MPI.

  Collective on C's communicator: every process both requests the off-process rows it needs
  and services the row requests it receives from others.

  Input:
    C          - the parallel MPIAIJ matrix
    ismax      - number of index-set pairs handled in this call (one stage)
    isrow      - row index sets, one per requested submatrix
    iscol      - column index sets, one per requested submatrix
    scall      - MAT_INITIAL_MATRIX (create submats) or MAT_REUSE_MATRIX (fill existing ones)
    allcolumns - allcolumns[i] is PETSC_TRUE when iscol[i] spans all columns of C; then no
                 column map is built and global column indices are stored directly
  Output:
    submats    - the assembled sequential submatrices

  Communication proceeds in four message rounds, one tag each:
    tag0 - requesters send owners the list of rows they need (header + row indices)
    tag1 - owners reply with the nonzero count of each requested row
    tag2 - owners send the column indices of the requested rows
    tag3 - owners send the numerical values of the requested rows
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm   = ((PetscObject)C)->comm;
  tag0   = ((PetscObject)C)->tag;
  size   = c->size;
  rank   = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr);

  /* Cache indices/sizes of each IS; for an allcolumns IS the column indices are never
     needed, so icol[i] stays PETSC_NULL and ncol[i] is the full global column count */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]){
      icol[i] = PETSC_NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w4[p] = rows of the current IS owned by proc p; summed over all ISs into
     w1[p] (total rows to request from p) and w3[p] (number of ISs that touch p) */
  ierr   = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */
  ierr   = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr   = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      /* linear scan of the ownership ranges to find the owning process of this row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is sized 2*msz: the first half holds the outgoing requests (sbuf1 points into it),
     the second half is later used as the tag1 receive buffer (rbuf2) */
  ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve per-destination slices of tmp, each of length w1[j] */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      sbuf1[j]  = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* header layout: [0]=number of ISs in this message, then per-IS pairs (is index, row count);
     ptr[j] tracks the next free data slot past the header */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* the tag1 reply mirrors the request layout, so rbuf2[i] is sized w1[pa[i]]
     and placed in the unused second half of tmp */
  ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* Service incoming requests: for each requested row record its nonzero count
     (diagonal part + off-diagonal part) and reply on tag1 */
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
  ierr        = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ  *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt    *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt    *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
      req_size[idex] = 0;
      rbuf1_i         = rbuf1[idex];
      start           = 2*rbuf1_i[0] + 1;
      ierr            = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr            = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i         = sbuf2[idex];
      for (j=start; j<end; j++) {
        id               = rbuf1_i[j] - rstart;
        ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]       = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      /* slot 0 carries the total nonzero count; the rest of the header is echoed back */
      sbuf2_i[0]   = req_size[idex];
      for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each tag1 reply arrives, size and post the receives for the column-index (tag2)
     and value (tag3) payloads; rbuf2[idex][0] is the total nonzero count for that sender */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols  = sbuf_aj_i + ct2;

          /* emit global column indices in sorted order: off-diagonal entries left of the
             diagonal block, then diagonal-block entries, then off-diagonal to the right */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++)   cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend)  cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals  = sbuf_aa_i+ct2;

          /* value ordering must mirror the column-index ordering built for tag2 above */
	  lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart)  vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++)   vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend)  vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* stored 1-based (0 means "column not in this submatrix"); skipped entirely for
     allcolumns ISs, where global indices are used directly */
  {
    const PetscInt *icol_i;
#if defined (PETSC_USE_CTABLE)
    ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#else
    ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]){
        ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = PETSC_NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] = nonzeros kept in local row j of submats[i]; one contiguous allocation */
  for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr);
  ierr    = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax   = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l = 0;
      row  = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]){
          for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) { lens_i[j]++;}
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
  /* CTABLE variant stores 1-based values; the array variant stores 0-based values */
#if defined (PETSC_USE_CTABLE)
  ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
  ierr    = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr);
  ierr    = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;}
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* walk each tag2 reply against the matching request (sbuf1) to count the kept entries
     of each off-process row */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no   = sbuf1_i[2*j-1];
        max1    = sbuf1_i[2*j];
        lens_i  = lens[is_no];
        if (!allcolumns[is_no]){cmap_i  = cmap[is_no];}
        rmap_i  = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); }
#else
          row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]){
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                lens_i[row]++;
              }
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool  flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs); CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs); CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs); CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
	l = 0;
        row      = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row      = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row] ;
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]){
            for (k=0; k<ncols; k++) {
#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol){
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k] ; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* consume each tag3 (values) reply together with its tag2 (columns) counterpart,
     copying entries that survive the column map into the target submatrix */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex   = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]){cmap_i = cmap[is_no];}
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row   = sbuf1_i[ct1];
#if defined (PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row   = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row] ;
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2 = rbuf2_i[ct1];
          if (!allcolumns[is_no]){
            for (l=0; l<max2; l++,ct2++) {

#if defined (PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]){
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]){
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1513 
1514 /*
1515  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1516  Be careful not to destroy them elsewhere.
1517  */
1518 #undef __FUNCT__
1519 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1520 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1521 {
1522   /* If making this function public, change the error returned in this function away from _PLIB. */
1523   PetscErrorCode ierr;
1524   Mat_MPIAIJ     *aij;
1525   PetscBool      seqaij;
1526 
1527   PetscFunctionBegin;
1528   /* Check to make sure the component matrices are compatible with C. */
1529   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1530   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1531   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
1532   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1533   if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1534 
1535   ierr = MatCreate(comm, C); CHKERRQ(ierr);
1536   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
1537   ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs); CHKERRQ(ierr);
1538   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1539   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1540   if((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1541   ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
1542   aij = (Mat_MPIAIJ*)((*C)->data);
1543   aij->A = A;
1544   aij->B = B;
1545   ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
1546   ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
1547   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1548   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 #undef __FUNCT__
1553 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1554 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1555 {
1556   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1557   PetscFunctionBegin;
1558   PetscValidPointer(A,2);
1559   PetscValidPointer(B,3);
1560   *A = aij->A;
1561   *B = aij->B;
1562   /* Note that we don't incref *A and *B, so be careful! */
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 #undef __FUNCT__
1567 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1568 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1569 						 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1570 						 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1571 						 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1572 {
1573   PetscErrorCode ierr;
1574   PetscMPIInt    size, flag;
1575   PetscInt       i,ii;
1576   PetscInt       ismax_c;
1577 
1578   PetscFunctionBegin;
1579   if (!ismax) {
1580     PetscFunctionReturn(0);
1581   }
1582   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1583     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
1584     if(flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1585     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1586     if(size > 1) {
1587       ++ismax_c;
1588     }
1589   }
1590   if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1591     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
1592   } else { /* if(ismax_c) */
1593     Mat         *A,*B;
1594     IS          *isrow_c, *iscol_c;
1595     PetscMPIInt size;
1596     /*
1597      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1598      array of sequential matrices underlying the resulting parallel matrices.
1599      Which arrays to allocate is based on the value of MatReuse scall.
1600      There are as many diag matrices as there are original index sets.
1601      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1602 
1603      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1604      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1605      will deposite the extracted diag and off-diag parts.
1606      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1607      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1608     */
1609 
1610     /* Parallel matrix array is allocated only if no reuse is taking place. */
1611     if (scall != MAT_REUSE_MATRIX) {
1612       ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
1613     } else {
1614       ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
1615       ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
1616       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1617       for(i = 0, ii = 0; i < ismax; ++i) {
1618         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1619         if(size > 1) {
1620           ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
1621           ++ii;
1622         } else {
1623           A[i] = (*submat)[i];
1624         }
1625       }
1626     }
1627     /*
1628      Construct the complements of the iscol ISs for parallel ISs only.
1629      These are used to extract the off-diag portion of the resulting parallel matrix.
1630      The row IS for the off-diag portion is the same as for the diag portion,
1631      so we merely alias the row IS, while skipping those that are sequential.
1632     */
1633     ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
1634     for(i = 0, ii = 0; i < ismax; ++i) {
1635       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1636       if(size > 1) {
1637 	isrow_c[ii] = isrow[i];
1638 	ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
1639 	++ii;
1640       }
1641     }
1642     /* Now obtain the sequential A and B submatrices separately. */
1643     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
1644     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
1645     for(ii = 0; ii < ismax_c; ++ii) {
1646       ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
1647     }
1648     ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
1649     /*
1650      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1651      have been extracted directly into the parallel matrices containing them, or
1652      simply into the sequential matrix identical with the corresponding A (if size == 1).
1653      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1654      A is put into the correct submat slot (if size == 1).
1655      */
1656     if(scall != MAT_REUSE_MATRIX) {
1657       for(i = 0, ii = 0; i < ismax; ++i) {
1658         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
1659         if(size > 1) {
1660           /*
1661            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1662            */
1663           /* Construct submat[i] from the Seq pieces A and B. */
1664           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
1665 
1666           ++ii;
1667         } else {
1668           (*submat)[i] = A[i];
1669         }
1670       }
1671     }
1672     ierr = PetscFree(A); CHKERRQ(ierr);
1673     ierr = PetscFree(B); CHKERRQ(ierr);
1674   }
1675   PetscFunctionReturn(0);
1676 }/* MatGetSubMatricesParallel_MPIXAIJ() */
1677 
1678 
1679 
1680 #undef __FUNCT__
1681 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1682 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1683 {
1684   PetscErrorCode ierr;
1685 
1686   PetscFunctionBegin;
1687   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
1688   PetscFunctionReturn(0);
1689 }
1690