xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 2c9ec6dfe874b911fa49ef7e759d29a8430d6aff)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
52 */
53 #undef __FUNCT__
54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one round of overlap increase on the
   index sets is[].  Rows owned by other processes are shipped to their owners
   (tag1), the owners look up the column structure of those rows, and the
   resulting column lists are sent back (tag2) and merged into per-IS tables.
   The input index sets are destroyed and replaced by the enlarged sets.

   Input:
     C    - the MPIAIJ matrix (its row layout determines row ownership)
     imax - number of index sets
     is   - the index sets; on output, each is[i] is a new sequential IS
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total no of rows destined for proc p; w2[p] = 1 iff a message goes to p;
     w3[p] = no of ISs contributing rows for p; w4 = per-IS scratch counts */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1((nrqs+1),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];   /* w1[j] becomes the full message length incl. header */
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp allocation into one contiguous slice per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;   /* ptr[j] = where row data starts, past the header */
  }

  /* Memory for doing local proc's work*/
  {
    /* table[i] is a bitmap over all M global rows; data[i] holds the rows collected for is[i] */
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;    /* header pair: (IS number, count) -- see format above */
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1((nrqs+1),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input ISs are replaced at the end by newly created ones */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc1((nrqr+1),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1((nrqs+1),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] = reply buffer/length for request i */
  ierr = PetscMalloc1((nrqr+1),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1((nrqr+1),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;   /* skip the header: data starts after 1 + 2*(no of ISs) ints */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          /* bitmap lookup-and-set keeps each row in data_i at most once */
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged output index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
328 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.
331 
332      Inputs:
333       C      - MAT_MPIAIJ;
334       imax - total no of index sets processed at a time;
335       table  - an array of char - size = m bits.
336 
337      Output:
338       isz    - array containing the count of the solution elements corresponding
339                to each index set;
340       data   - pointer to the solutions
341 */
342 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
343 {
344   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
345   Mat        A  = c->A,B = c->B;
346   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
347   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
348   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
349   PetscBT    table_i;
350 
351   PetscFunctionBegin;
352   rstart = C->rmap->rstart;
353   cstart = C->cmap->rstart;
354   ai     = a->i;
355   aj     = a->j;
356   bi     = b->i;
357   bj     = b->j;
358   garray = c->garray;
359 
360 
361   for (i=0; i<imax; i++) {
362     data_i  = data[i];
363     table_i = table[i];
364     isz_i   = isz[i];
365     for (j=0,max=isz[i]; j<max; j++) {
366       row   = data_i[j] - rstart;
367       start = ai[row];
368       end   = ai[row+1];
369       for (k=start; k<end; k++) { /* Amat */
370         val = aj[k] + cstart;
371         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
372       }
373       start = bi[row];
374       end   = bi[row+1];
375       for (k=start; k<end; k++) { /* Bmat */
376         val = garray[bj[k]];
377         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
378       }
379     }
380     isz[i] = isz_i;
381   }
382   PetscFunctionReturn(0);
383 }
384 
385 #undef __FUNCT__
386 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
387 /*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
389          and return the output
390 
391          Input:
392            C    - the matrix
393            nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests
395 
396          Output:
397            xdata - array of messages to be sent back
398            isz1  - size of each message
399 
  For better efficiency perhaps we should malloc each xdata[i] separately;
then if a realloc is required we would need only copy the data for that one
message rather than all previous messages, as happens now where a single
large chunk of memory is used.
404 
405 */
406 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
407 {
408   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
409   Mat            A  = c->A,B = c->B;
410   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
411   PetscErrorCode ierr;
412   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
413   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
414   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
415   PetscInt       *rbuf_i,kmax,rbuf_0;
416   PetscBT        xtable;
417 
418   PetscFunctionBegin;
419   m      = C->rmap->N;
420   rstart = C->rmap->rstart;
421   cstart = C->cmap->rstart;
422   ai     = a->i;
423   aj     = a->j;
424   bi     = b->i;
425   bj     = b->j;
426   garray = c->garray;
427 
428 
429   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
430     rbuf_i =  rbuf[i];
431     rbuf_0 =  rbuf_i[0];
432     ct    += rbuf_0;
433     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
434   }
435 
436   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
437   else max1 = 1;
438   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
439   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
440   ++no_malloc;
441   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
442   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
443 
444   ct3 = 0;
445   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
446     rbuf_i =  rbuf[i];
447     rbuf_0 =  rbuf_i[0];
448     ct1    =  2*rbuf_0+1;
449     ct2    =  ct1;
450     ct3   += ct1;
451     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
452       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
453       oct2 = ct2;
454       kmax = rbuf_i[2*j];
455       for (k=0; k<kmax; k++,ct1++) {
456         row = rbuf_i[ct1];
457         if (!PetscBTLookupSet(xtable,row)) {
458           if (!(ct3 < mem_estimate)) {
459             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
460             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
461             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
462             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
463             xdata[0]     = tmp;
464             mem_estimate = new_estimate; ++no_malloc;
465             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
466           }
467           xdata[i][ct2++] = row;
468           ct3++;
469         }
470       }
471       for (k=oct2,max2=ct2; k<max2; k++) {
472         row   = xdata[i][k] - rstart;
473         start = ai[row];
474         end   = ai[row+1];
475         for (l=start; l<end; l++) {
476           val = aj[l] + cstart;
477           if (!PetscBTLookupSet(xtable,val)) {
478             if (!(ct3 < mem_estimate)) {
479               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
480               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
481               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
482               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
483               xdata[0]     = tmp;
484               mem_estimate = new_estimate; ++no_malloc;
485               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
486             }
487             xdata[i][ct2++] = val;
488             ct3++;
489           }
490         }
491         start = bi[row];
492         end   = bi[row+1];
493         for (l=start; l<end; l++) {
494           val = garray[bj[l]];
495           if (!PetscBTLookupSet(xtable,val)) {
496             if (!(ct3 < mem_estimate)) {
497               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
498               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
499               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
500               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
501               xdata[0]     = tmp;
502               mem_estimate = new_estimate; ++no_malloc;
503               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
504             }
505             xdata[i][ct2++] = val;
506             ct3++;
507           }
508         }
509       }
510       /* Update the header*/
511       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
512       xdata[i][2*j-1] = rbuf_i[2*j-1];
513     }
514     xdata[i][0] = rbuf_0;
515     xdata[i+1]  = xdata[i] + ct2;
516     isz1[i]     = ct2; /* size of each message */
517   }
518   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
519   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
520   PetscFunctionReturn(0);
521 }
522 /* -------------------------------------------------------------------------*/
523 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
524 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
525 /*
526     Every processor gets the entire matrix
527 */
528 #undef __FUNCT__
529 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gives every process a full sequential copy of A.

   Input:
     A     - the parallel MPIAIJ matrix
     flag  - MAT_GET_VALUES to gather numerical values too, otherwise only the
             structure is built/reused
     scall - MAT_INITIAL_MATRIX builds the structure; MAT_REUSE_MATRIX reuses *Bin

   Output:
     Bin   - array of one sequential matrix (allocated here on MAT_INITIAL_MATRIX;
             caller owns it)
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    /* only the locally-owned entries of lens are filled here; the rest are
       valid only after the Allgatherv below */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* the send buffer aliases B's own j array at this rank's row offset, so the
       in-place Allgatherv below fills in everyone else's rows around it */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    /* same interleave (lower Bmat / Amat / upper Bmat) as the index pass above,
       so values land next to their column indices */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs exist already on MAT_INITIAL_MATRIX; allocate on reuse */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the sequential copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
712 
713 
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
717 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
718 {
719   PetscErrorCode ierr;
720   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
721   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
722 
723   PetscFunctionBegin;
724   /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
725   /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
726      However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
727      take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
728    */
729   for (i = 0; i < ismax; ++i) {
730     PetscBool sorted;
731     ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
732     if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
733   }
734 
735   /*
736        Check for special case: each processor gets entire matrix
737   */
738   if (ismax == 1 && C->rmap->N == C->cmap->N) {
739     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
740     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
741     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
742     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
743     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
744       wantallmatrix = PETSC_TRUE;
745 
746       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
747     }
748   }
749   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
750   if (twantallmatrix) {
751     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
752     PetscFunctionReturn(0);
753   }
754 
755   /* Allocate memory to hold all the submatrices */
756   if (scall != MAT_REUSE_MATRIX) {
757     ierr = PetscMalloc1((ismax+1),submat);CHKERRQ(ierr);
758   }
759 
760   /* Check for special case: each processor gets entire matrix columns */
761   ierr = PetscMalloc1((ismax+1),&allcolumns);CHKERRQ(ierr);
762   for (i=0; i<ismax; i++) {
763     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
764     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
765     if (colflag && ncol == C->cmap->N) {
766       allcolumns[i] = PETSC_TRUE;
767     } else {
768       allcolumns[i] = PETSC_FALSE;
769     }
770   }
771 
772   /* Determine the number of stages through which submatrices are done */
773   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
774 
775   /*
776      Each stage will extract nmax submatrices.
777      nmax is determined by the matrix column dimension.
778      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
779   */
780   if (!nmax) nmax = 1;
781   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
782 
783   /* Make sure every processor loops through the nstages */
784   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
785 
786   for (i=0,pos=0; i<nstages; i++) {
787     if (pos+nmax <= ismax) max_no = nmax;
788     else if (pos == ismax) max_no = 0;
789     else                   max_no = ismax-pos;
790     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
791     pos += max_no;
792   }
793 
794   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
795   PetscFunctionReturn(0);
796 }
797 
798 /* -------------------------------------------------------------------------*/
799 #undef __FUNCT__
800 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts ismax sequential submatrices,
   submats[i] = C(isrow[i],iscol[i]), from the parallel MPIAIJ matrix C.

   Collective on the communicator of C.

   Input Parameters:
+  C          - the parallel MPIAIJ matrix
.  ismax      - number of index-set pairs handled in this stage
.  isrow      - row index sets (global row numbers of C)
.  iscol      - column index sets (global column numbers of C); ignored for i with allcolumns[i]
.  scall      - MAT_INITIAL_MATRIX to create submats[], MAT_REUSE_MATRIX to refill existing ones
-  allcolumns - allcolumns[i] set when iscol[i] is the identity over all columns of C,
                enabling a fast path that skips the column lookup tables

   Output Parameter:
.  submats    - the sequential AIJ submatrices (one per index-set pair)

   Communication protocol (four message rounds, one tag each):
     tag0: each process sends the rows it needs to the owning processes
     tag1: owners reply with the nonzero count of each requested row
     tag2: owners send the column indices of the requested rows
     tag3: owners send the numerical values of the requested rows
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  /* Cache the indices and sizes of each IS; for "all columns" sets no column
     indices are needed (icol[i] stays NULL and ncol[i] is the full width of C) */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total rows requested from proc p; w2[p] = 1 iff a message goes to p;
     w3[p] = number of ISs that request at least one row from p; w4 = per-IS scratch */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      /* linear search over the ownership ranges for the process owning this row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] lists the ranks actually receiving a request from this process */
  ierr = PetscMalloc1((nrqs+1),&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is a single arena of 2*msz ints: the first msz hold the outgoing
     request buffers (sbuf1[j] point into it), the second msz serve as rbuf2 below */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the tmp arena into one slice of w1[j] ints per destination proc */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* Message layout (see the sample format at the top of this file):
     sbuf1[j][0] = number of ISs in this message; then per-IS pairs (is index, count);
     ptr[j] walks the data area that follows the header */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1((nrqs+1),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2 reuses the second half of the tmp arena; replies mirror the
     request layout, with per-row nonzero counts in place of row numbers */
  ierr     = PetscMalloc1((nrqs+1),&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1((nrqs+1),&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr = PetscMalloc1((nrqr+1),&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    /* Answer each incoming request as it arrives (Waitany returns in
       completion order; idex identifies the request, i counts iterations) */
    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1((end+1),&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      /* per requested row: total nonzeros in the diag (A) plus off-diag (B) parts */
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  /* As each size reply (rbuf2) arrives, post the receives for the column
     indices (rbuf3, tag2) and values (rbuf4, tag3) of the requested rows;
     rbuf2[idex][0] is the total nonzero count for that reply */
  ierr = PetscMalloc1((nrqs+1),&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1((rbuf2[idex][0]+1),&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1((rbuf2[idex][0]+1),&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1((nrqs+1),&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* one contiguous allocation for all replies, sliced by req_size[] */
  ierr = PetscMalloc1((nrqr+1),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1((j+1),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1((nrqr+1),&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;    /* ct1 walks the row numbers after the header */
      ct2       = 0;                   /* ct2 is the write offset into sbuf_aj_i */
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols = sbuf_aj_i + ct2;

          /* emit global column indices in ascending order:
             off-diag columns left of the diag block, then diag, then the rest */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1((nrqs+1),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* numerical values are packed in exactly the same order as the column
     indices above, so index and value streams stay aligned */
  ierr = PetscMalloc1((nrqr+1),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1((j+1),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1((nrqr+1),&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend   = C->cmap->rend;
    PetscInt    *b_j   = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  /* rbuf1 (allocated by PetscPostIrecvInt as one slab) is no longer needed */
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqs+1),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1((nrqr+1),&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* cmap[i] maps global column -> (local column + 1); 0 means "not selected".
     The +1 shift lets 0 act as the not-found sentinel in both branches. */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1((1+ismax),&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] = nonzero count of row j of submatrix i; stored in one slab */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
  /* NOTE the asymmetry: the CTABLE branch stores (local row + 1) and callers
     decrement after lookup, while the array branch stores the local row directly */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1((1+ismax),&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* replay each outgoing request against the size (rbuf2) and column (rbuf3)
     replies to count how many received columns fall inside each submatrix */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* write directly into the SeqAIJ (i,j,a) arrays; imat_ilen tracks the
     current fill of each row so the off-proc pass below can append */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k];  /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* same replay as the lens update above, but now the value replies (rbuf4)
     have arrived, so indices and values are copied into the submatrices */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);  /* only the pointer array; storage lived in tmp */
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1525 
/*
 Note: the Seq matrices used to construct this MPI matrix are NOT reference-counted
 (their reference counts are not increased), so destroying them elsewhere while the
 MPI matrix is still in use would leave it holding dangling pointers.
 */
1530 #undef __FUNCT__
1531 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1532 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1533 {
1534   /* If making this function public, change the error returned in this function away from _PLIB. */
1535   PetscErrorCode ierr;
1536   Mat_MPIAIJ     *aij;
1537   PetscBool      seqaij;
1538 
1539   PetscFunctionBegin;
1540   /* Check to make sure the component matrices are compatible with C. */
1541   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1542   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1543   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1544   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1545   if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1546 
1547   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1548   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1549   ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
1550   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1551   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1552   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1553   ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1554   aij    = (Mat_MPIAIJ*)((*C)->data);
1555   aij->A = A;
1556   aij->B = B;
1557   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
1558   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);
1559 
1560   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1561   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 #undef __FUNCT__
1566 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1567 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1568 {
1569   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1570 
1571   PetscFunctionBegin;
1572   PetscValidPointer(A,2);
1573   PetscValidPointer(B,3);
1574   *A = aij->A;
1575   *B = aij->B;
1576   /* Note that we don't incref *A and *B, so be careful! */
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 #undef __FUNCT__
1581 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
1582 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1583                                                  PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
1584                                                  PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
1585                                                  PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
1586 {
1587   PetscErrorCode ierr;
1588   PetscMPIInt    size, flag;
1589   PetscInt       i,ii;
1590   PetscInt       ismax_c;
1591 
1592   PetscFunctionBegin;
1593   if (!ismax) PetscFunctionReturn(0);
1594 
1595   for (i = 0, ismax_c = 0; i < ismax; ++i) {
1596     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
1597     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1598     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1599     if (size > 1) ++ismax_c;
1600   }
1601   if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
1602     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1603   } else { /* if (ismax_c) */
1604     Mat         *A,*B;
1605     IS          *isrow_c, *iscol_c;
1606     PetscMPIInt size;
1607     /*
1608      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1609      array of sequential matrices underlying the resulting parallel matrices.
1610      Which arrays to allocate is based on the value of MatReuse scall.
1611      There are as many diag matrices as there are original index sets.
1612      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.
1613 
1614      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
1615      we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq
1616      will deposite the extracted diag and off-diag parts.
1617      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
1618      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
1619     */
1620 
1621     /* Parallel matrix array is allocated only if no reuse is taking place. */
1622     if (scall != MAT_REUSE_MATRIX) {
1623       ierr = PetscMalloc1((ismax),submat);CHKERRQ(ierr);
1624     } else {
1625       ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
1626       ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
1627       /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
1628       for (i = 0, ii = 0; i < ismax; ++i) {
1629         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1630         if (size > 1) {
1631           ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
1632           ++ii;
1633         } else A[i] = (*submat)[i];
1634       }
1635     }
1636     /*
1637      Construct the complements of the iscol ISs for parallel ISs only.
1638      These are used to extract the off-diag portion of the resulting parallel matrix.
1639      The row IS for the off-diag portion is the same as for the diag portion,
1640      so we merely alias the row IS, while skipping those that are sequential.
1641     */
1642     ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
1643     for (i = 0, ii = 0; i < ismax; ++i) {
1644       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1645       if (size > 1) {
1646         isrow_c[ii] = isrow[i];
1647 
1648         ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
1649         ++ii;
1650       }
1651     }
1652     /* Now obtain the sequential A and B submatrices separately. */
1653     ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
1654     ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
1655     for (ii = 0; ii < ismax_c; ++ii) {
1656       ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
1657     }
1658     ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
1659     /*
1660      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
1661      have been extracted directly into the parallel matrices containing them, or
1662      simply into the sequential matrix identical with the corresponding A (if size == 1).
1663      Otherwise, make sure that parallel matrices are constructed from A & B, or the
1664      A is put into the correct submat slot (if size == 1).
1665      */
1666     if (scall != MAT_REUSE_MATRIX) {
1667       for (i = 0, ii = 0; i < ismax; ++i) {
1668         ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
1669         if (size > 1) {
1670           /*
1671            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
1672            */
1673           /* Construct submat[i] from the Seq pieces A and B. */
1674           ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
1675 
1676           ++ii;
1677         } else (*submat)[i] = A[i];
1678       }
1679     }
1680     ierr = PetscFree(A);CHKERRQ(ierr);
1681     ierr = PetscFree(B);CHKERRQ(ierr);
1682   }
1683   PetscFunctionReturn(0);
1684 } /* MatGetSubMatricesParallel_MPIXAIJ() */
1685 
1686 
1687 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   MPIAIJ instantiation of the generic parallel-submatrix driver:
   delegates to MatGetSubMatricesParallel_MPIXAIJ with the AIJ-specific
   callbacks for sequential extraction (MatGetSubMatrices_MPIAIJ),
   parallel assembly from seq pieces (MatCreateMPIAIJFromSeqMatrices_Private),
   and seq-piece retrieval on reuse (MatMPIAIJExtractSeqMatrices_Private).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1698