xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 785e854f82a3c614b452fca2cf5ad4f2afe8bdde)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
52 */
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one round of overlap increase on
   the index sets is[]: rows not owned locally are shipped to their owning
   processes, every process augments the rows it owns with their column
   indices (via _Local for on-process rows and _Receive for requests from
   others), and the replies are merged back.  The input ISs are destroyed and
   replaced by the enlarged ones.  Collective on the communicator of C.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;          /* global number of rows */

  /* tag1 labels the request messages, tag2 the reply messages */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
       w1[p] = total number of row indices destined for process p
       w2[p] = 1 iff a message will be sent to process p
       w3[p] = number of index sets contributing to the message for p
       w4    = per-IS scratch version of w1 */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1((nrqs+1),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];   /* w1[j] becomes the full message length */
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into per-destination slices outdat[j],
       each of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers; ptr[j] then points just past the header, where the
     row data for destination j will be appended */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work:
       table[i] - bit array marking which global rows are already in IS i
       data[i]  - growing list of global row indices belonging to IS i
       isz[i]   - current length of data[i]
     d_p and t_p are the single backing allocations sliced below */
  {
    ierr = PetscMalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS: record (IS number, count)
         pairs for every destination that received rows from this IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1((nrqs+1),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input ISs are consumed; enlarged replacements are created below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work: expand locally-owned rows while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1((nrqr+1),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1((nrqs+1),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] hold the reply to each */
  ierr = PetscMalloc1((nrqr+1),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc1(size,&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      /* sanity check: request i must have come from the process recorded
         by PetscGatherMessageLengths */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1((nrqr+1),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge it into data[]/table[] */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the requests,
         (IS number, count) pairs followed by the row data */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged output index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
331 
332 #undef __FUNCT__
333 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - Mat_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
348 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
349 {
350   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
351   Mat        A  = c->A,B = c->B;
352   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
353   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
354   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
355   PetscBT    table_i;
356 
357   PetscFunctionBegin;
358   rstart = C->rmap->rstart;
359   cstart = C->cmap->rstart;
360   ai     = a->i;
361   aj     = a->j;
362   bi     = b->i;
363   bj     = b->j;
364   garray = c->garray;
365 
366 
367   for (i=0; i<imax; i++) {
368     data_i  = data[i];
369     table_i = table[i];
370     isz_i   = isz[i];
371     for (j=0,max=isz[i]; j<max; j++) {
372       row   = data_i[j] - rstart;
373       start = ai[row];
374       end   = ai[row+1];
375       for (k=start; k<end; k++) { /* Amat */
376         val = aj[k] + cstart;
377         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
378       }
379       start = bi[row];
380       end   = bi[row+1];
381       for (k=start; k<end; k++) { /* Bmat */
382         val = garray[bj[k]];
383         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
384       }
385     }
386     isz[i] = isz_i;
387   }
388   PetscFunctionReturn(0);
389 }
390 
391 #undef __FUNCT__
392 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;        /* global number of rows; sizes the bit table */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* ct = total number of IS headers across all requests,
     total_sz = total number of requested row indices */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* estimate the reply size from the average nonzeros per local row; all
     replies share the single allocation xdata[0], grown on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts entries written into the shared buffer so far; whenever it
     would exceed mem_estimate, the buffer is reallocated at 1.5x and every
     xdata[ctr] pointer is recomputed since they alias xdata[0] */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];       /* number of index sets in this request */
    ct1    =  2*rbuf_0+1;      /* read cursor: first row index after header */
    ct2    =  ct1;             /* write cursor into xdata[i] */
    ct3   += ct1;              /* reserve room for the reply's header */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      /* fresh bit table per IS: duplicates within one IS's reply are culled */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;              /* start of this IS's rows in the reply */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer and re-slice it for requests 1..i */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;    /* requested row itself goes first */
          ct3++;
        }
      }
      /* expand each accepted row with its column indices; max2 is fixed so
         columns added here are not themselves expanded */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;   /* global row -> local row */
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {     /* diagonal-block columns */
          val = aj[l] + cstart;         /* shift to global numbering */
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {     /* off-diagonal-block columns */
          val = garray[bj[l]];          /* map through garray to global */
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header: (IS number, count) pair for this IS's reply */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
528 /* -------------------------------------------------------------------------*/
529 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
530 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
531 /*
532     Every processor gets the entire matrix
533 */
534 #undef __FUNCT__
535 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gathers the entire parallel matrix A onto
   every process as a sequential matrix *Bin[0], using MPI_Allgatherv on row
   lengths, column indices, and (optionally) numerical values.

   flag  - MAT_GET_VALUES to also gather the numerical values, otherwise only
           the nonzero structure is assembled
   scall - MAT_INITIAL_MATRIX creates the sequential matrix (and allocates
           *Bin); MAT_REUSE_MATRIX refills an existing one
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local rows: nnz = diagonal-block nnz + off-diagonal-block nnz */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, written directly into
       this rank's slot of B's j array, in global column numbering and
       sorted within each row (below-diagonal B part, then A, then the rest)
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices (CSR row offsets from the gathered row lengths) */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location,
       in the same below-diagonal / diagonal / above-diagonal order used
       for the column indices above
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path did not allocate these above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
718 
719 
720 
721 #undef __FUNCT__
722 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
723 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
724 {
725   PetscErrorCode ierr;
726   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
727   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
728 
729   PetscFunctionBegin;
730   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
731   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
732      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
733      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
734    */
735   for (i = 0; i < ismax; ++i) {
736     PetscBool sorted;
737     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
738     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
739   }
740 
741   /*
742        Check for special case: each processor gets entire matrix
743   */
744   if (ismax == 1 && C->rmap->N == C->cmap->N) {
745     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
746     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
747     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
748     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
749     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
750       wantallmatrix = PETSC_TRUE;
751 
752       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
753     }
754   }
755   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
756   if (twantallmatrix) {
757     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
758     PetscFunctionReturn(0);
759   }
760 
761   /* Allocate memory to hold all the submatrices */
762   if (scall != MAT_REUSE_MATRIX) {
763     ierr = PetscMalloc1((ismax+1),submat);CHKERRQ(ierr);
764   }
765 
766   /* Check for special case: each processor gets entire matrix columns */
767   ierr = PetscMalloc1((ismax+1),&allcolumns);CHKERRQ(ierr);
768   for (i=0; i<ismax; i++) {
769     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
770     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
771     if (colflag && ncol == C->cmap->N) {
772       allcolumns[i] = PETSC_TRUE;
773     } else {
774       allcolumns[i] = PETSC_FALSE;
775     }
776   }
777 
778   /* Determine the number of stages through which submatrices are done */
779   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
780 
781   /*
782      Each stage will extract nmax submatrices.
783      nmax is determined by the matrix column dimension.
784      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
785   */
786   if (!nmax) nmax = 1;
787   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
788 
789   /* Make sure every processor loops through the nstages */
790   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
791 
792   for (i=0,pos=0; i<nstages; i++) {
793     if (pos+nmax <= ismax) max_no = nmax;
794     else if (pos == ismax) max_no = 0;
795     else                   max_no = ismax-pos;
796     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
797     pos += max_no;
798   }
799 
800   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
801   PetscFunctionReturn(0);
802 }
803 
/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts ismax sequential submatrices from the
   parallel AIJ matrix C, one per (isrow[i],iscol[i]) pair, into submats[].

   allcolumns[i] set means iscol[i] is the identity over all columns, so the
   column map is skipped and global column indices are stored directly.

   Rows owned by other processes are fetched with a four-round nonblocking
   exchange (one fresh tag per round after the first):
     round 1 (tag0): send the global row numbers we need from each process
     round 2 (tag1): each owner replies with the nonzero count of every requested row
     round 3 (tag2): owners send the column indices of the requested rows
     round 4 (tag3): owners send the numerical values of the requested rows
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  /* round 1 reuses the object's own tag; rounds 2-4 get fresh tags below */
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  /* Cache the indices/sizes of every index set; allcolumns sets need no column list */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p]: total rows requested from process p (later grown into the full
     message length incl. header); w2[p]: 1 iff a message goes to p;
     w3[p]: number of index sets that need rows from p;
     w4[p]: rows needed from p for the index set currently being scanned */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear scan of the ownership ranges to find the owner of global row */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1((nrqs+1),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp holds 2*msz entries: the first msz back the outgoing sbuf1 messages,
     the second msz serve as the round-2 receive buffers (rbuf2, set up below) */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve per-destination send buffers out of the first half of tmp */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* message layout: [0] = #IS entries, then per IS the pair (IS index, row count),
     followed by all the requested global row numbers; ptr[j] tracks the data cursor */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1((nrqs+1),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* a round-2 reply has the same length as the request, so reuse the second
     half of tmp; reply layout mirrors the request header with per-row nnz counts */
  ierr     = PetscMalloc1((nrqs+1),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1((nrqs+1),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* answer each incoming row request: reply[0] = total nonzeros we will ship,
     reply[1..start-1] = echoed request header, reply[j] = nnz of requested row j */
  ierr = PetscMalloc1((nrqr+1),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1((end+1),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        /* nonzeros of a local row = diagonal-block nnz + off-diagonal-block nnz */
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* once a round-2 reply arrives we know exactly how many column indices (rbuf3)
     and values (rbuf4) that process will send; post rounds 3 and 4 immediately */

  ierr = PetscMalloc1((nrqs+1),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1((rbuf2[idex][0]+1),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1((rbuf2[idex][0]+1),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1((nrqs+1),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* one contiguous allocation, sliced per requester by req_size[] */
  ierr = PetscMalloc1((nrqr+1),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1((j+1),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1((nrqr+1),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols = sbuf_aj_i + ct2;

          /* emit global columns in ascending order: off-diag left of the
             diagonal block, then the diagonal block, then off-diag right */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1((nrqs+1),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* values are packed in exactly the same order as the column indices above */
  ierr = PetscMalloc1((nrqr+1),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1((j+1),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1((nrqr+1),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend   = C->cmap->rend;
    PetscInt    *b_j   = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* entries are stored shifted by one (j+1) so that 0 means "column not in this submatrix" */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1((1+ismax),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] = number of nonzeros in local row j of submatrix i (for preallocation) */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1((1+ismax),&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* walk our own round-1 request headers in step with the round-3 replies */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* writes directly into the Mat_SeqAIJ i/j/a arrays, tracking fill with ilen */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k];  /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* round-4 values arrive in the same order as the round-3 column indices,
     so rbuf3/rbuf4 can be consumed with a single shared cursor ct2 */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  /* freeing tmp also releases the storage behind sbuf1[] and rbuf2[] */
  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1531 
1532 /*
1533  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1534  Be careful not to destroy them elsewhere.
1535  */
1536 #undef __FUNCT__
1537 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1538 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1539 {
1540   /* If making this function public, change the error returned in this function away from _PLIB. */
1541   PetscErrorCode ierr;
1542   Mat_MPIAIJ     *aij;
1543   PetscBool      seqaij;
1544 
1545   PetscFunctionBegin;
1546   /* Check to make sure the component matrices are compatible with C. */
1547   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1548   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1549   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1550   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1551   if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1552 
1553   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1554   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1555   ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
1556   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1557   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1558   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1559   ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1560   aij    = (Mat_MPIAIJ*)((*C)->data);
1561   aij->A = A;
1562   aij->B = B;
1563   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
1564   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);
1565 
1566   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1567   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 #undef __FUNCT__
1572 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1573 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1574 {
1575   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1576 
1577   PetscFunctionBegin;
1578   PetscValidPointer(A,2);
1579   PetscValidPointer(B,3);
1580   *A = aij->A;
1581   *B = aij->B;
1582   /* Note that we don't incref *A and *B, so be careful! */
1583   PetscFunctionReturn(0);
1584 }
1585 
1586 #undef __FUNCT__
1587 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1588 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1589                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1590                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1591                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1592 {
1593   PetscErrorCode ierr;
1594   PetscMPIInt    size, flag;
1595   PetscInt       i,ii;
1596   PetscInt       ismax_c;
1597 
1598   PetscFunctionBegin;
1599   if (!ismax) PetscFunctionReturn(0);
1600 
1601   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1602     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
1603     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1604     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1605     if (size > 1) ++ismax_c;
1606   }
1607   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1608     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1609   } else { /* if (ismax_c) */
1610     Mat         *A,*B;
1611     IS          *isrow_c, *iscol_c;
1612     PetscMPIInt size;
1613     /*
1614      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1615      array of sequential matrices underlying the resulting parallel matrices.
1616      Which arrays to allocate is based on the value of MatReuse scall.
1617      There are as many diag matrices as there are original index sets.
1618      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1619 
1620      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1621      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1622      will deposite the extracted diag and off-diag parts.
1623      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1624      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1625     */
1626 
1627     /* Parallel matrix array is allocated only if no reuse is taking place. */
1628     if (scall != MAT_REUSE_MATRIX) {
1629       ierr = PetscMalloc1((ismax),submat);CHKERRQ(ierr);
1630     } else {
1631       ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
1632       ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
1633       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1634       for (i = 0, ii = 0; i < ismax; ++i) {
1635         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1636         if (size > 1) {
1637           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
1638           ++ii;
1639         } else A[i] = (*submat)[i];
1640       }
1641     }
1642     /*
1643      Construct the complements of the iscol ISs for parallel ISs only.
1644      These are used to extract the off-diag portion of the resulting parallel matrix.
1645      The row IS for the off-diag portion is the same as for the diag portion,
1646      so we merely alias the row IS, while skipping those that are sequential.
1647     */
1648     ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
1649     for (i = 0, ii = 0; i < ismax; ++i) {
1650       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1651       if (size > 1) {
1652         isrow_c[ii] = isrow[i];
1653 
1654         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
1655         ++ii;
1656       }
1657     }
1658     /* Now obtain the sequential A and B submatrices separately. */
1659     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
1660     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
1661     for (ii = 0; ii < ismax_c; ++ii) {
1662       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
1663     }
1664     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
1665     /*
1666      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1667      have been extracted directly into the parallel matrices containing them, or
1668      simply into the sequential matrix identical with the corresponding A (if size == 1).
1669      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1670      A is put into the correct submat slot (if size == 1).
1671      */
1672     if (scall != MAT_REUSE_MATRIX) {
1673       for (i = 0, ii = 0; i < ismax; ++i) {
1674         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1675         if (size > 1) {
1676           /*
1677            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1678            */
1679           /* Construct submat[i] from the Seq pieces A and B. */
1680           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
1681 
1682           ++ii;
1683         } else (*submat)[i] = A[i];
1684       }
1685     }
1686     ierr = PetscFree(A);CHKERRQ(ierr);
1687     ierr = PetscFree(B);CHKERRQ(ierr);
1688   }
1689   PetscFunctionReturn(0);
1690 } /* MatGetSubMatricesParallel_MPIXAIJ() */
1691 
1692 
1693 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
  MatGetSubMatricesParallel_MPIAIJ - AIJ-specific entry point for extracting
  (possibly parallel) submatrices of C indexed by isrow[]/iscol[].

  Thin wrapper: delegates to MatGetSubMatricesParallel_MPIXAIJ, binding the
  AIJ callbacks for sequential extraction (MatGetSubMatrices_MPIAIJ), parallel
  assembly from seq parts (MatCreateMPIAIJFromSeqMatrices_Private), and seq
  part retrieval (MatMPIAIJExtractSeqMatrices_Private).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1704