xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision e6e75211d226c622f451867f53ce5d558649ff4f)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 #include <../src/mat/impls/aij/mpi/mpiaij.h>
8 #include <petscbt.h>
9 
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
15 
16 #undef __FUNCT__
17 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
18 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
19 {
20   PetscErrorCode ierr;
21   PetscInt       i;
22 
23   PetscFunctionBegin;
24   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25   for (i=0; i<ov; ++i) {
26     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
27   }
28   PetscFunctionReturn(0);
29 }
30 
31 /*
32   Sample message format:
33   If a processor A wants processor B to process some elements corresponding
34   to index sets is[1],is[5]
35   mesg [0] = 2   (no of index sets in the mesg)
36   -----------
37   mesg [1] = 1 => is[1]
38   mesg [2] = sizeof(is[1]);
39   -----------
40   mesg [3] = 5  => is[5]
41   mesg [4] = sizeof(is[5]);
42   -----------
43   mesg [5]
44   mesg [n]  datas[1]
45   -----------
46   mesg[n+1]
47   mesg[m]  data(is[5])
48   -----------
49 
50   Notes:
51   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
53 */
54 #undef __FUNCT__
55 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* two fresh tags: tag1 for the request phase, tag2 for the reply phase */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
       w1[p] - total number of index entries destined for proc p (all index sets)
       w2[p] - 1 iff a message will be sent to proc p
       w3[p] - number of index sets that contribute at least one entry for p
       w4[p] - per-index-set scratch counter */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single buffer tmp into one contiguous chunk per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers: outdat[j][0] counts the index sets in the message,
    the 2*w3[j] slots after it hold (is-number,count) pairs; ptr[j] walks the
    data region that follows the header */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work:
     table[i] - bitmap of global rows already in solution i
     data[i]  - solution rows for index set i (at most M each) */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input index sets are replaced by the enlarged ones at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work: expand the locally-owned rows while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* xdata[i]/isz1[i] will hold the reply to request i */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the request messages */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged index sets in place of the (destroyed) inputs */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
326 
327 #undef __FUNCT__
328 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
329 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
331        the work on the local processor.
332 
333      Inputs:
334       C      - MAT_MPIAIJ;
335       imax - total no of index sets processed at a time;
336       table  - an array of char - size = m bits.
337 
338      Output:
339       isz    - array containing the count of the solution elements corresponding
340                to each index set;
341       data   - pointer to the solutions
342 */
343 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
344 {
345   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
346   Mat        A  = c->A,B = c->B;
347   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
348   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
349   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
350   PetscBT    table_i;
351 
352   PetscFunctionBegin;
353   rstart = C->rmap->rstart;
354   cstart = C->cmap->rstart;
355   ai     = a->i;
356   aj     = a->j;
357   bi     = b->i;
358   bj     = b->j;
359   garray = c->garray;
360 
361 
362   for (i=0; i<imax; i++) {
363     data_i  = data[i];
364     table_i = table[i];
365     isz_i   = isz[i];
366     for (j=0,max=isz[i]; j<max; j++) {
367       row   = data_i[j] - rstart;
368       start = ai[row];
369       end   = ai[row+1];
370       for (k=start; k<end; k++) { /* Amat */
371         val = aj[k] + cstart;
372         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
373       }
374       start = bi[row];
375       end   = bi[row+1];
376       for (k=start; k<end; k++) { /* Bmat */
377         val = garray[bj[k]];
378         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
379       }
380     }
381     isz[i] = isz_i;
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 #undef __FUNCT__
387 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of index sets over all messages; total_sz = total
     number of requested row indices */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Heuristic size for the single reply buffer xdata[0] (all replies share it
     back-to-back); grown by 1.5x below whenever it overflows */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;  /* ct3 = total reply entries written so far, across all messages */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* cursor over incoming row indices (past header) */
    ct2    =  ct1;          /* cursor over outgoing entries in xdata[i] */
    ct3   += ct1;           /* reply carries the same-sized header */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated) ... */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* reply buffer full: grow it 1.5x and rebase xdata[1..i] into the
               new chunk (isz1[0..i-1] hold the finished message sizes) */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* ... then expand each copied row by its columns in A (diagonal block)
         and B (off-diagonal block, mapped to global via garray) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
523 /* -------------------------------------------------------------------------*/
524 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
525 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
526 /*
527     Every processor gets the entire matrix
528 */
529 #undef __FUNCT__
530 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
         (lens is indexed by GLOBAL row; each rank fills only its own rows
          before the allgather completes the array)
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
       (written directly into this rank's slice of b->j so the in-place
        allgather below can fill the rest)
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion: off-diagonal columns below the
         diagonal block come first so each row stays sorted globally */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
       (same lower/diagonal/upper interleaving as the column indices above)
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
       (recvcounts/displs already exist for MAT_INITIAL_MATRIX; otherwise
        allocate them here)
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
713 
714 
715 
716 #undef __FUNCT__
717 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
718 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
719 {
720   PetscErrorCode ierr;
721   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
722   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
723 
724   PetscFunctionBegin;
725 
726   /*
727        Check for special case: each processor gets entire matrix
728   */
729   if (ismax == 1 && C->rmap->N == C->cmap->N) {
730     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
731     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
732     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
733     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
734     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
735       wantallmatrix = PETSC_TRUE;
736 
737       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
738     }
739   }
740   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
741   if (twantallmatrix) {
742     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
743     PetscFunctionReturn(0);
744   }
745 
746   /* Allocate memory to hold all the submatrices */
747   if (scall != MAT_REUSE_MATRIX) {
748     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
749   }
750 
751   /* Check for special case: each processor gets entire matrix columns */
752   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
753   for (i=0; i<ismax; i++) {
754     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
755     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
756     if (colflag && ncol == C->cmap->N) {
757       allcolumns[i] = PETSC_TRUE;
758     } else {
759       allcolumns[i] = PETSC_FALSE;
760     }
761   }
762 
763   /* Determine the number of stages through which submatrices are done */
764   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
765 
766   /*
767      Each stage will extract nmax submatrices.
768      nmax is determined by the matrix column dimension.
769      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
770   */
771   if (!nmax) nmax = 1;
772   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
773 
774   /* Make sure every processor loops through the nstages */
775   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
776 
777   for (i=0,pos=0; i<nstages; i++) {
778     if (pos+nmax <= ismax) max_no = nmax;
779     else if (pos == ismax) max_no = 0;
780     else                   max_no = ismax-pos;
781     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
782     pos += max_no;
783   }
784 
785   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
786   PetscFunctionReturn(0);
787 }
788 
789 /* -------------------------------------------------------------------------*/
790 #undef __FUNCT__
791 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts up to ismax sequential submatrices of the
   parallel MPIAIJ matrix C, one per (isrow[i],iscol[i]) pair, into submats[].

   Collective on C (every rank participates in the send/receive phases even if its
   own ismax for this stage is 0).

   Input Parameters:
+  C          - the parallel MPIAIJ matrix
.  ismax      - number of index-set pairs handled in this stage
.  isrow,iscol - row/column index sets defining each submatrix (global indices)
.  scall      - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
-  allcolumns - allcolumns[i] is PETSC_TRUE when iscol[i] is the identity over all
                columns of C; the column-map lookup is then skipped entirely and the
                submatrix keeps C's global column indices

   Output Parameter:
.  submats - array of SeqAIJ matrices (created here when scall==MAT_INITIAL_MATRIX)

   Communication proceeds in four nonblocking phases, each on its own tag:
     tag0: send the rows each remote rank must extract for us (sbuf1/rbuf1)
     tag1: receive back, per requested row, its nonzero count (sbuf2/rbuf2)
     tag2: receive the column indices of those rows (sbuf_aj/rbuf3)
     tag3: receive the numerical values of those rows (sbuf_aa/rbuf4)
   Locally-owned rows are assembled directly without communication.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  /* Cache the indices/sizes of each IS; for allcolumns[] sets there is no
     explicit column list, so icol[i] is left NULL and ncol[i] spans all of C */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = #rows requested from rank p (summed over all ISs),
     w2[p] = 1 iff any message goes to p, w3[p] = #ISs involving p,
     w4[p] = per-IS scratch row count */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear scan of the ownership ranges to find the rank owning this row */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  /* tmp is sized 2*msz: the first msz ints back the sbuf1[] send messages,
     the second msz ints are reused below as the rbuf2[] receive buffers */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  /* Message layout (see the format comment near the top of this file):
     sbuf1[j][0] = #ISs, then (is-index,row-count) pairs, then the row data */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  /* rbuf2[] reuses the upper half of tmp (starting at tmp+msz); each reply has
     the same length as the corresponding request sbuf1[j] */
  ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  /* Service each incoming row request: count the nonzeros of every requested
     row (diagonal block sAi plus off-diagonal block sBi) and reply with those
     counts so the requester can size its index/value receive buffers */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      /* idex is the request slot that completed; it need not equal i */
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/
  /* As each size reply arrives, allocate the index (rbuf3) and value (rbuf4)
     buffers of exactly the advertised total size and post their Irecvs */

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* One contiguous allocation backs all nrqr per-request segments sbuf_aj[i] */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols = sbuf_aj_i + ct2;

          /* emit global column indices in ascending order: off-diagonal
             entries left of the diagonal block (bmap[] < cstart), then the
             diagonal block shifted by cstart, then off-diagonal >= cend */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  /* Same layout as sbuf_aj, but carrying the numerical values; the three
     sub-loops below mirror the column-index ordering above exactly */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend   = C->cmap->rend;
    PetscInt    *b_j   = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* cmap[i] maps global column g -> local column+1 (0 means "not selected");
     the +1 shift lets 0/PETSC_TABLE-absent double as "column not in iscol[i]" */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens[i][j] will hold the nonzero count of local row j of submatrix i;
     all rows share one contiguous allocation behind lens[0] */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
  /* The ctable variant stores row+1 -> j+1 (1-based, like cmap); the plain
     array variant stores j directly (0-based) -- the lookups below differ
     accordingly */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  /* Walk the received column-index messages (tag2): for each row we asked a
     remote rank for, count how many of its columns survive the cmap filter */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* Writes directly into the SeqAIJ (i,j,a) arrays, using ilen[] as the
     per-row fill cursor; MatAssemblyBegin/End at the bottom finalizes this */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k];  /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  /* As each value message (tag3) completes, scatter its entries into the
     submatrices; rbuf3 (indices) and rbuf4 (values) are walked in lockstep
     via ct2 */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  /* sort the rows */
  /* Off-process contributions can interleave with local ones out of order;
     allcolumns submatrices are skipped because their rows were written in
     C's (already sorted) global column order */
  {
    PetscInt    *imat_ilen,*imat_j,*imat_i;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      imat_ilen = mat->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscInt ilen;

        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ilen  = imat_ilen[j];
        ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  /* note: freeing tmp here also releases the storage aliased by rbuf2[] */
  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1542 
1543 /*
1544  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
1547  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
1548  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
1549  state, and needs to be "assembled" later by compressing B's column space.
1550 
1551  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
1552  Following this call, C->A & C->B have been created, even if empty.
1553  */
1554 #undef __FUNCT__
1555 #define __FUNCT__ "MatSetSeqMats_MPIAIJ"
1556 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
1557 {
1558   /* If making this function public, change the error returned in this function away from _PLIB. */
1559   PetscErrorCode ierr;
1560   Mat_MPIAIJ     *aij;
1561   Mat_SeqAIJ     *Baij;
1562   PetscBool      seqaij,Bdisassembled;
1563   PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
1564   PetscScalar    v;
1565   const PetscInt *rowindices,*colindices;
1566 
1567   PetscFunctionBegin;
1568   /* Check to make sure the component matrices (and embeddings) are compatible with C. */
1569   if (A) {
1570     ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1571     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
1572     if (rowemb) {
1573       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1574       if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
1575     } else {
1576       if (C->rmap->n != A->rmap->n) {
1577 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
1578       }
1579     }
1580     if (dcolemb) {
1581       ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
1582       if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
1583     } else {
1584       if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
1585     }
1586   }
1587   if (B) {
1588     ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
1589     if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
1590     if (rowemb) {
1591       ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
1592       if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n);
1593     } else {
1594       if (C->rmap->n != B->rmap->n) {
1595 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
1596       }
1597     }
1598     if (ocolemb) {
1599       ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
1600       if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
1601     } else {
1602       if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
1603     }
1604   }
1605 
1606   aij    = (Mat_MPIAIJ*)(C->data);
1607   if (!aij->A) {
1608     /* Mimic parts of MatMPIAIJSetPreallocation() */
1609     ierr   = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
1610     ierr   = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
1611     ierr   = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
1612     ierr   = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
1613     ierr   = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
1614   }
1615   if (A) {
1616     ierr   = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
1617   } else {
1618     ierr = MatSetUp(aij->A);CHKERRQ(ierr);
1619   }
1620   if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
1621     /*
1622       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
1623       need to "disassemble" B -- convert it to using C's global indices.
1624       To insert the values we take the safer, albeit more expensive, route of MatSetValues().
1625 
1626       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
1627       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
1628 
1629       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
1630       At least avoid calling MatSetValues() and the implied searches?
1631     */
1632 
1633     if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
1634 #if defined(PETSC_USE_CTABLE)
1635       ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
1636 #else
1637       ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
1638       ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
1639 #endif
1640       ngcol = 0;
1641       if (aij->lvec) {
1642 	ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
1643       }
1644       if (aij->garray) {
1645 	ierr = PetscFree(aij->garray);CHKERRQ(ierr);
1646 	ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
1647       }
1648       ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
1649       ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
1650     }
1651     if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
1652       ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
1653     }
1654     if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
1655       ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
1656     }
1657   }
1658   Bdisassembled = PETSC_FALSE;
1659   if (!aij->B) {
1660     ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
1661     ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
1662     ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
1663     ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
1664     ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
1665     Bdisassembled = PETSC_TRUE;
1666   }
1667   if (B) {
1668     Baij = (Mat_SeqAIJ*)(B->data);
1669     if (pattern == DIFFERENT_NONZERO_PATTERN) {
1670       ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
1671       for (i=0; i<B->rmap->n; i++) {
1672 	nz[i] = Baij->i[i+1] - Baij->i[i];
1673       }
1674       ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
1675       ierr = PetscFree(nz);CHKERRQ(ierr);
1676     }
1677 
1678     ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
1679     shift = rend-rstart;
1680     count = 0;
1681     rowindices = NULL;
1682     colindices = NULL;
1683     if (rowemb) {
1684       ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
1685     }
1686     if (ocolemb) {
1687       ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
1688     }
1689     for (i=0; i<B->rmap->n; i++) {
1690       PetscInt row;
1691       row = i;
1692       if (rowindices) row = rowindices[i];
1693       for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
1694 	col  = Baij->j[count];
1695 	if (colindices) col = colindices[col];
1696 	if (Bdisassembled && col>=rstart) col += shift;
1697 	v    = Baij->a[count];
1698 	ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
1699 	++count;
1700       }
1701     }
1702     /* No assembly for aij->B is necessary. */
1703     /* FIXME: set aij->B's nonzerostate correctly. */
1704   } else {
1705     ierr = MatSetUp(aij->B);CHKERRQ(ierr);
1706   }
1707   C->preallocated  = PETSC_TRUE;
1708   C->was_assembled = PETSC_FALSE;
1709   C->assembled     = PETSC_FALSE;
1710    /*
1711       C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
1712       Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
1713    */
1714   PetscFunctionReturn(0);
1715 }
1716 
1717 #undef __FUNCT__
1718 #define __FUNCT__ "MatGetSeqMats_MPIAIJ"
1719 /*
1720   B uses local indices with column indices ranging between 0 and N-n; they  must be interpreted using garray.
1721  */
1722 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
1723 {
1724   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1725 
1726   PetscFunctionBegin;
1727   PetscValidPointer(A,2);
1728   PetscValidPointer(B,3);
1729   /* FIXME: make sure C is assembled */
1730   *A = aij->A;
1731   *B = aij->B;
1732   /* Note that we don't incref *A and *B, so be careful! */
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 /*
1737   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
1738   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
1739 */
1740 #undef __FUNCT__
1741 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
1742 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
1743                                                  PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
1744 					         PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
1745 					         PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
1746 					         PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
1747 {
1748   PetscErrorCode ierr;
1749   PetscMPIInt    isize,flag;
1750   PetscInt       i,ii,cismax,ispar;
1751   Mat            *A,*B;
1752   IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
1753 
1754   PetscFunctionBegin;
1755   if (!ismax) PetscFunctionReturn(0);
1756 
1757   for (i = 0, cismax = 0; i < ismax; ++i) {
1758     PetscMPIInt isize;
1759     ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
1760     if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
1761     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
1762     if (isize > 1) ++cismax;
1763   }
1764   /*
1765      If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
1766      ispar counts the number of parallel ISs across C's comm.
1767   */
1768   ierr = MPI_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
1769   if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
1770     ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
1771     PetscFunctionReturn(0);
1772   }
1773 
1774   /* if (ispar) */
1775   /*
1776     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
1777     These are used to extract the off-diag portion of the resulting parallel matrix.
1778     The row IS for the off-diag portion is the same as for the diag portion,
1779     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
1780   */
1781   ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
1782   ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
1783   for (i = 0, ii = 0; i < ismax; ++i) {
1784     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
1785     if (isize > 1) {
1786       /*
1787 	 TODO: This is the part that's ***NOT SCALABLE***.
1788 	 To fix this we need to extract just the indices of C's nonzero columns
1789 	 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
1790 	 part of iscol[i] -- without actually computing ciscol[ii]. This also has
1791 	 to be done without serializing on the IS list, so, most likely, it is best
1792 	 done by rewriting MatGetSubMatrices_MPIAIJ() directly.
1793       */
1794       ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
1795       /* Now we have to
1796 	 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
1797 	     were sorted on each rank, concatenated they might no longer be sorted;
1798 	 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
1799 	     indices in the nondecreasing order to the original index positions.
1800 	 If ciscol[ii] is strictly increasing, the permutation IS is NULL.
1801       */
1802       ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
1803       ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
1804       ++ii;
1805     }
1806   }
1807   ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
1808   for (i = 0, ii = 0; i < ismax; ++i) {
1809     PetscInt       j,issize;
1810     const PetscInt *indices;
1811 
1812     /*
1813        Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
1814      */
1815     ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
1816     ierr = ISSort(isrow[i]);CHKERRQ(ierr);
1817     ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
1818     ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
1819     for (j = 1; j < issize; ++j) {
1820       if (indices[j] == indices[j-1]) {
1821 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
1822       }
1823     }
1824     ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
1825 
1826 
1827     ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
1828     ierr = ISSort(iscol[i]);CHKERRQ(ierr);
1829     ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
1830     ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
1831     for (j = 1; j < issize; ++j) {
1832       if (indices[j-1] == indices[j]) {
1833 	SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
1834       }
1835     }
1836     ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
1837     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
1838     if (isize > 1) {
1839       cisrow[ii] = isrow[i];
1840       ++ii;
1841     }
1842   }
1843   /*
1844     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
1845     array of sequential matrices underlying the resulting parallel matrices.
1846     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
1847     contain duplicates.
1848 
1849     There are as many diag matrices as there are original index sets. There are only as many parallel
1850     and off-diag matrices, as there are parallel (comm size > 1) index sets.
1851 
1852     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
1853     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
1854       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
1855       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
1856       with A[i] and B[ii] extracted from the corresponding MPI submat.
1857     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
1858       will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
1859       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
1860       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
1861       values into A[i] and B[ii] sitting inside the corresponding submat.
1862     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
1863       A[i], B[ii], AA[i] or BB[ii] matrices.
1864   */
1865   /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
1866   if (scall == MAT_INITIAL_MATRIX) {
1867     ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
1868     /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
1869   } else {
1870     ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
1871     ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
1872     /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
1873     for (i = 0, ii = 0; i < ismax; ++i) {
1874       ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
1875       if (isize > 1) {
1876 	Mat AA,BB;
1877 	ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
1878 	if (!isrow_p[i] && !iscol_p[i]) {
1879 	  A[i] = AA;
1880 	} else {
1881 	  /* TODO: extract A[i] composed on AA. */
1882 	  ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
1883 	}
1884 	if (!isrow_p[i] && !ciscol_p[ii]) {
1885 	  B[ii] = BB;
1886 	} else {
1887 	  /* TODO: extract B[ii] composed on BB. */
1888 	  ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
1889 	}
1890 	++ii;
1891       } else {
1892 	if (!isrow_p[i] && !iscol_p[i]) {
1893 	  A[i] = (*submat)[i];
1894 	} else {
1895 	  /* TODO: extract A[i] composed on (*submat)[i]. */
1896 	  ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
1897 	}
1898       }
1899     }
1900   }
1901   /* Now obtain the sequential A and B submatrices separately. */
1902   ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
1903   ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
1904   /*
1905     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
1906     matrices A & B have been extracted directly into the parallel matrices containing them, or
1907     simply into the sequential matrix identical with the corresponding A (if isize == 1).
1908     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
1909     to have the same sparsity pattern.
1910     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
1911     must be constructed for C. This is done by setseqmat(s).
1912   */
1913   for (i = 0, ii = 0; i < ismax; ++i) {
1914     /*
1915        TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
1916        That way we can avoid sorting and computing permutations when reusing.
1917        To this end:
1918         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
1919 	- if caching arrays to hold the ISs, make and compose a container for them so that it can
1920 	  be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
1921     */
1922     MatStructure pattern;
1923     if (scall == MAT_INITIAL_MATRIX) {
1924       pattern = DIFFERENT_NONZERO_PATTERN;
1925     } else {
1926       pattern = SAME_NONZERO_PATTERN;
1927     }
1928     ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
1929     /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
1930     if (isize > 1) {
1931       if (scall == MAT_INITIAL_MATRIX) {
1932 	ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
1933 	ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1934 	ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
1935 	ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
1936 	ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
1937       }
1938       /*
1939 	For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
1940       */
1941       {
1942 	Mat AA,BB;
1943 	AA = NULL;
1944 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
1945 	BB = NULL;
1946 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
1947 	if (AA || BB) {
1948 	  ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
1949 	  ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1950 	  ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1951 	}
1952 	if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
1953 	  /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
1954 	  ierr = MatDestroy(&AA);CHKERRQ(ierr);
1955 	}
1956 	if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
1957 	  /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
1958 	  ierr = MatDestroy(&BB);CHKERRQ(ierr);
1959 	}
1960       }
1961       ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
1962       ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
1963       ++ii;
1964     } else { /* if (isize == 1) */
1965       if (scall == MAT_INITIAL_MATRIX) {
1966 	if (isrow_p[i] || iscol_p[i]) {
1967 	  ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
1968 	} else (*submat)[i] = A[i];
1969       }
1970       if (isrow_p[i] || iscol_p[i]) {
1971 	ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
1972 	/* Otherwise A is extracted straight into (*submats)[i]. */
1973 	/* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
1974 	ierr = MatDestroy(A+i);CHKERRQ(ierr);
1975       }
1976     }
1977     ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
1978     ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
1979   }
1980   ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
1981   ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
1982   ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
1983   ierr = PetscFree(A);CHKERRQ(ierr);
1984   ierr = PetscFree(B);CHKERRQ(ierr);
1985   PetscFunctionReturn(0);
1986 }
1987 
1988 
1989 
1990 #undef __FUNCT__
1991 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
1992 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1993 {
1994   PetscErrorCode ierr;
1995 
1996   PetscFunctionBegin;
1997   ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
1998   PetscFunctionReturn(0);
1999 }
2000