xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision a9b180a608ab5b0c8e2e0dfcd432ec21ea26b82a)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/aij/mpi/mpiaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
18 {
19   PetscErrorCode ierr;
20   PetscInt       i;
21 
22   PetscFunctionBegin;
23   if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24   for (i=0; i<ov; ++i) {
25     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
26   }
27   PetscFunctionReturn(0);
28 }
29 
30 /*
31   Sample message format:
32   If a processor A wants processor B to process some elements corresponding
33   to index sets is[1],is[5]
34   mesg [0] = 2   (no of index sets in the mesg)
35   -----------
36   mesg [1] = 1 => is[1]
37   mesg [2] = sizeof(is[1]);
38   -----------
39   mesg [3] = 5  => is[5]
40   mesg [4] = sizeof(is[5]);
41   -----------
42   mesg [5]
43   mesg [n]  datas[1]
44   -----------
45   mesg[n+1]
46   mesg[m]  data(is[5])
47   -----------
48 
49   Notes:
50   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
52 */
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs exactly one level of overlap growth.

   Rows owned by other processes are shipped to their owners (phase 1, tag1),
   each owner expands them using its local matrix rows, and the expanded index
   lists are returned (phase 2, tag2).  On exit each is[i] has been destroyed
   and recreated with the enlarged index list.

   Work arrays (all of length 'size'):
     w1[p] - number of indices this rank must send to rank p
     w2[p] - 1 if a message goes to rank p, else 0
     w3[p] - number of index sets contributing to the message for rank p
     w4[p] - per-IS scratch count of indices owned by rank p
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;   /* global number of rows; bit tables are sized by M */

  /* two fresh tags keep the request and reply traffic separate */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* per-IS scratch, rezeroed each pass */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      /* fold the per-IS counts into the totals; w3 counts contributing ISs */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];   /* w1[j] becomes the full message length incl. header */
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages: tmp is one contiguous slab of msz
     ints; outdat[] points into it per destination rank */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the slab: each destination gets w1[j] consecutive entries */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: [0]=#ISs, then (is_no,count) pairs */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;   /* data region starts past the header */
  }

  /* Memory for doing local proc's work: one bit table (M bits) and one
     solution array (M ints) per index set, all carved from two slabs */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input sets are consumed; they are recreated (enlarged) at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work: expand locally-owned rows while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the requests received from other ranks; xdata[i]/isz1[i] hold the
     reply for request i (xdata[0] owns the underlying allocation) */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors: merge the replies into the local
     tables/solution arrays, skipping rows already present (bit-table check) */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the outgoing requests */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (previously destroyed) index sets from the merged lists */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
325 
326 #undef __FUNCT__
327 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
328 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
330        the work on the local processor.
331 
332      Inputs:
333       C      - MAT_MPIAIJ;
334       imax - total no of index sets processed at a time;
335       table  - an array of char - size = m bits.
336 
337      Output:
338       isz    - array containing the count of the solution elements corresponding
339                to each index set;
340       data   - pointer to the solutions
341 */
342 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
343 {
344   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
345   Mat        A  = c->A,B = c->B;
346   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
347   PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
348   PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
349   PetscBT    table_i;
350 
351   PetscFunctionBegin;
352   rstart = C->rmap->rstart;
353   cstart = C->cmap->rstart;
354   ai     = a->i;
355   aj     = a->j;
356   bi     = b->i;
357   bj     = b->j;
358   garray = c->garray;
359 
360 
361   for (i=0; i<imax; i++) {
362     data_i  = data[i];
363     table_i = table[i];
364     isz_i   = isz[i];
365     for (j=0,max=isz[i]; j<max; j++) {
366       row   = data_i[j] - rstart;
367       start = ai[row];
368       end   = ai[row+1];
369       for (k=start; k<end; k++) { /* Amat */
370         val = aj[k] + cstart;
371         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
372       }
373       start = bi[row];
374       end   = bi[row+1];
375       for (k=start; k<end; k++) { /* Bmat */
376         val = garray[bj[k]];
377         if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
378       }
379     }
380     isz[i] = isz_i;
381   }
382   PetscFunctionReturn(0);
383 }
384 
385 #undef __FUNCT__
386 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
387 /*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
389          and return the output
390 
391          Input:
392            C    - the matrix
393            nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests
395 
396          Output:
397            xdata - array of messages to be sent back
398            isz1  - size of each message
399 
400   For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows, as it is now where a single large chunk of
403 memory is used.
404 
405 */
406 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
407 {
408   Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
409   Mat            A  = c->A,B = c->B;
410   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
411   PetscErrorCode ierr;
412   PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
413   PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
414   PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
415   PetscInt       *rbuf_i,kmax,rbuf_0;
416   PetscBT        xtable;
417 
418   PetscFunctionBegin;
419   m      = C->rmap->N;
420   rstart = C->rmap->rstart;
421   cstart = C->cmap->rstart;
422   ai     = a->i;
423   aj     = a->j;
424   bi     = b->i;
425   bj     = b->j;
426   garray = c->garray;
427 
428 
429   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
430     rbuf_i =  rbuf[i];
431     rbuf_0 =  rbuf_i[0];
432     ct    += rbuf_0;
433     for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
434   }
435 
436   if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
437   else max1 = 1;
438   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
439   ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
440   ++no_malloc;
441   ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
442   ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);
443 
444   ct3 = 0;
445   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
446     rbuf_i =  rbuf[i];
447     rbuf_0 =  rbuf_i[0];
448     ct1    =  2*rbuf_0+1;
449     ct2    =  ct1;
450     ct3   += ct1;
451     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
452       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
453       oct2 = ct2;
454       kmax = rbuf_i[2*j];
455       for (k=0; k<kmax; k++,ct1++) {
456         row = rbuf_i[ct1];
457         if (!PetscBTLookupSet(xtable,row)) {
458           if (!(ct3 < mem_estimate)) {
459             new_estimate = (PetscInt)(1.5*mem_estimate)+1;
460             ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
461             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
462             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
463             xdata[0]     = tmp;
464             mem_estimate = new_estimate; ++no_malloc;
465             for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
466           }
467           xdata[i][ct2++] = row;
468           ct3++;
469         }
470       }
471       for (k=oct2,max2=ct2; k<max2; k++) {
472         row   = xdata[i][k] - rstart;
473         start = ai[row];
474         end   = ai[row+1];
475         for (l=start; l<end; l++) {
476           val = aj[l] + cstart;
477           if (!PetscBTLookupSet(xtable,val)) {
478             if (!(ct3 < mem_estimate)) {
479               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
480               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
481               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
482               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
483               xdata[0]     = tmp;
484               mem_estimate = new_estimate; ++no_malloc;
485               for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
486             }
487             xdata[i][ct2++] = val;
488             ct3++;
489           }
490         }
491         start = bi[row];
492         end   = bi[row+1];
493         for (l=start; l<end; l++) {
494           val = garray[bj[l]];
495           if (!PetscBTLookupSet(xtable,val)) {
496             if (!(ct3 < mem_estimate)) {
497               new_estimate = (PetscInt)(1.5*mem_estimate)+1;
498               ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
499               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
500               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
501               xdata[0]     = tmp;
502               mem_estimate = new_estimate; ++no_malloc;
503               for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
504             }
505             xdata[i][ct2++] = val;
506             ct3++;
507           }
508         }
509       }
510       /* Update the header*/
511       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
512       xdata[i][2*j-1] = rbuf_i[2*j-1];
513     }
514     xdata[i][0] = rbuf_0;
515     xdata[i+1]  = xdata[i] + ct2;
516     isz1[i]     = ct2; /* size of each message */
517   }
518   ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
519   ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
520   PetscFunctionReturn(0);
521 }
522 /* -------------------------------------------------------------------------*/
523 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
524 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
525 /*
526     Every processor gets the entire matrix
527 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Fast path for the case where every rank wants
   the entire (square, identity-permuted) matrix: the structure and values are
   assembled with MPI_Allgatherv rather than the general send/receive machinery.

   Input:
     A     - the parallel matrix
     flag  - MAT_GET_VALUES to gather numerical values as well as structure
     scall - MAT_INITIAL_MATRIX builds the sequential copy; MAT_REUSE_MATRIX
             refills the values of a previously returned matrix
   Output:
     Bin   - array of length 1 holding the sequential copy (allocated here on
             MAT_INITIAL_MATRIX)
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row length = diagonal-block nnz + off-diagonal-block nnz */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, merging the split
       diagonal/off-diagonal storage back into a single sorted row
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location,
       in the same lower/diagonal/upper interleaving as the column indices
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: the structure pass did not allocate these */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the sequential copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
712 
713 
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
717 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
718 {
719   PetscErrorCode ierr;
720   PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
721   PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;
722 
723   PetscFunctionBegin;
724 
725   /*
726        Check for special case: each processor gets entire matrix
727   */
728   if (ismax == 1 && C->rmap->N == C->cmap->N) {
729     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
730     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
731     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
732     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
733     if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
734       wantallmatrix = PETSC_TRUE;
735 
736       ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
737     }
738   }
739   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
740   if (twantallmatrix) {
741     ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
742     PetscFunctionReturn(0);
743   }
744 
745   /* Allocate memory to hold all the submatrices */
746   if (scall != MAT_REUSE_MATRIX) {
747     ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
748   }
749 
750   /* Check for special case: each processor gets entire matrix columns */
751   ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
752   for (i=0; i<ismax; i++) {
753     ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
754     ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
755     if (colflag && ncol == C->cmap->N) {
756       allcolumns[i] = PETSC_TRUE;
757     } else {
758       allcolumns[i] = PETSC_FALSE;
759     }
760   }
761 
762   /* Determine the number of stages through which submatrices are done */
763   nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
764 
765   /*
766      Each stage will extract nmax submatrices.
767      nmax is determined by the matrix column dimension.
768      If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
769   */
770   if (!nmax) nmax = 1;
771   nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
772 
773   /* Make sure every processor loops through the nstages */
774   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
775 
776   for (i=0,pos=0; i<nstages; i++) {
777     if (pos+nmax <= ismax) max_no = nmax;
778     else if (pos == ismax) max_no = 0;
779     else                   max_no = ismax-pos;
780     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
781     pos += max_no;
782   }
783 
784   ierr = PetscFree(allcolumns);CHKERRQ(ierr);
785   PetscFunctionReturn(0);
786 }
787 
788 /* -------------------------------------------------------------------------*/
789 #undef __FUNCT__
790 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/*
   MatGetSubMatrices_MPIAIJ_Local - Extracts up to ismax sequential submatrices of the
   parallel AIJ matrix C, one per (isrow[i],iscol[i]) pair, into submats[].

   The extraction runs a four-phase nonblocking MPI protocol:
     1. (tag0) each process sends the row indices it needs to the owning processes;
     2. (tag1) owners reply with the nonzero count of every requested row;
     3. (tag2) owners send the (global) column indices of the requested rows;
     4. (tag3) owners send the corresponding numerical values.
   Locally owned rows are copied directly without communication.

   allcolumns[i] == PETSC_TRUE means iscol[i] covers all columns of C; in that case no
   column map is built and global column indices are used directly in submats[i].

   With scall == MAT_REUSE_MATRIX the submatrices must already exist with identical
   row lengths (see the check below); otherwise they are created here.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Cache the indices and sizes of every row/column IS; icol[i] stays NULL for
     the allcolumns case, where ncol[i] is the full global column count */
  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total number of rows requested from process p (across all ISs),
     w2[p] = 1 iff a message goes to p, w3[p] = number of ISs needing rows from p,
     w4[p] = per-IS scratch count of rows owned by p */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* linear scan of the ownership ranges to find the owner of this global row */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa[] is the compressed list of destination process ranks */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, and the from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages.
     Note tmp is 2*msz long: the first msz entries back the send buffers sbuf1[],
     the second msz entries are later aliased by the receive buffers rbuf2[] */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space; layout per message (see sample format above):
     [0] = number of ISs, then (IS index, row count) pairs, then the row indices */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /*  Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size.
     rbuf2[] aliases the upper half of tmp (allocated above), so only the
     pointer array rbuf2 itself is freed at the end, never rbuf2[i] */
  ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages; idex is the index of whichever pending request completed */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      /* nonzero count of each requested row = diag-block (A) + off-diag-block (B) counts */
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /*  recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  /* As each size message arrives, allocate the data buffers (rbuf2[idex][0] is the
     total nonzero count) and post the receives for column indices and values */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols, in ascending global
             order: off-diag entries left of the diag block, then diag-block
             entries (shifted by cstart), then off-diag entries to the right */
          cols = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend   = C->cmap->rend;
    PetscInt    *b_j   = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in exactly the same
             order as the column indices in the previous phase */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices.
     Values are stored shifted by +1 so that 0 means "column not in iscol[i]" */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... lens[i][j] will hold the
     nonzero count of local row j of submatrix i (preallocation info) */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices
     (CTABLE variant stores values shifted by +1; the array variant stores them directly) */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data: walk each arriving column-index message using
     the matching send header sbuf1[idex] to attribute rows to the right IS */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows by writing directly into the SeqAIJ
     internal arrays (mat->i/j/a/ilen) of each submatrix */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k];  /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /*   Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  /* sort the rows.
     NOTE(review): allcolumns submatrices are skipped here — presumably their
     column indices arrive already sorted; confirm before relying on it */
  {
    PetscInt    *imat_ilen,*imat_j,*imat_i;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      imat_ilen = mat->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscInt ilen;

        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ilen  = imat_ilen[j];
        ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  /* rbuf2[i] point into tmp (freed with the PetscFree4 below), so only the
     rbuf2 pointer array itself is freed */
  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1541 
1542 /*
1543  Observe that the Seq matrices used to construct this MPI matrix are not increfed.
1544  Be careful not to destroy them elsewhere.
1545  */
1546 #undef __FUNCT__
1547 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
1548 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
1549 {
1550   /* If making this function public, change the error returned in this function away from _PLIB. */
1551   PetscErrorCode ierr;
1552   Mat_MPIAIJ     *aij;
1553   PetscBool      seqaij;
1554 
1555   PetscFunctionBegin;
1556   /* Check to make sure the component matrices are compatible with C. */
1557   ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1558   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
1559   ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
1560   if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
1561   if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1562 
1563   ierr = MatCreate(comm, C);CHKERRQ(ierr);
1564   ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
1565   ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
1566   ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
1567   ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
1568   if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
1569   ierr   = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
1570   aij    = (Mat_MPIAIJ*)((*C)->data);
1571   aij->A = A;
1572   aij->B = B;
1573   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
1574   ierr   = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);
1575 
1576   (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
1577   (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
1578   PetscFunctionReturn(0);
1579 }
1580 
1581 #undef __FUNCT__
1582 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
1583 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
1584 {
1585   Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
1586 
1587   PetscFunctionBegin;
1588   PetscValidPointer(A,2);
1589   PetscValidPointer(B,3);
1590   *A = aij->A;
1591   *B = aij->B;
1592   /* Note that we don't incref *A and *B, so be careful! */
1593   PetscFunctionReturn(0);
1594 }
1595 
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
  MatGetSubMatricesParallel_MPIXAIJ - Generic driver that extracts submatrices of C
  indexed by the (isrow[i], iscol[i]) pairs, allowing the index sets to live on
  communicators of size > 1 (in which case the resulting submatrix is itself
  parallel).  The format-specific work is delegated to three callbacks:

+ getsubmats_seq - extracts sequential submatrices for a list of IS pairs
                   (also allocates the output Mat array when scall == MAT_INITIAL_MATRIX)
. makefromseq    - assembles a parallel matrix from a sequential diag (A) and
                   off-diag (B) pair on a given communicator
- extractseq     - the inverse of makefromseq: pulls the A and B blocks back out
                   of an existing parallel matrix (used on the MAT_REUSE_MATRIX path)

  Collective on the index-set communicators.  For each i, isrow[i] and iscol[i]
  must share the same communicator (checked with MPI_Comm_compare below).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;  /* number of "parallel" index sets, i.e. those on a comm of size > 1 */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Validate the IS pairs and count how many of them are parallel. */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat         *A,*B;      /* per-IS diag blocks (length ismax) and off-diag blocks (length ismax_c) */
    IS          *isrow_c, *iscol_c;
    PetscMPIInt size;       /* NOTE(review): shadows the outer `size` above -- harmless but confusing; consider removing */
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

     Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which getsubmats_seq
     will deposit the extracted diag and off-diag parts.
     However, if reuse is taking place, we have to allocate the seq matrix arrays here.
     If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /* Parallel submat: disassemble it into its diag/off-diag pieces so they can be refilled in place. */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i]; /* Sequential submat: it IS its own diag block; no off-diag part exists. */
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i]; /* aliased, not referenced: do not destroy isrow_c entries below */

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    /* NOTE(review): when scall == MAT_INITIAL_MATRIX, A and B are presumably allocated
       inside getsubmats_seq (they are uninitialized here) -- confirm against the callback's contract. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr); /* iscol_c entries were created by ISGetNonlocalIS above */
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
     If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
     have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or the
     A is put into the correct submat slot (if size == 1).
     */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
           For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
           */
          /* Construct submat[i] from the Seq pieces A and B. */
          /* NOTE(review): makefromseq appears to take ownership of A[i] and B[ii]
             (they are stored, not copied, by MatCreateMPIAIJFromSeqMatrices_Private) -- verify. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else (*submat)[i] = A[i]; /* sequential case: hand the diag block over as the result */
      }
    }
    /* Only the holder arrays are freed here; the matrices themselves now live in *submat. */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
} /* MatGetSubMatricesParallel_MPIXAIJ() */
1701 
1702 
1703 
1704 #undef __FUNCT__
1705 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
1706 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1707 {
1708   PetscErrorCode ierr;
1709 
1710   PetscFunctionBegin;
1711   ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
1712   PetscFunctionReturn(0);
1713 }
1714