xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision e600fa544e2bb197ca2af9b6e65ea465976dec56)
1 
2 /*
3   Defines projective product routines where A is a MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 #include <petsc/private/hashmapiv.h>
13 #include <petsc/private/hashseti.h>
14 #include <petscsf.h>
15 
16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
17 {
18   PetscErrorCode    ierr;
19   PetscBool         iascii;
20   PetscViewerFormat format;
21   Mat_APMPI         *ptap;
22 
23   PetscFunctionBegin;
24   MatCheckProduct(A,1);
25   ptap = (Mat_APMPI*)A->product->data;
26   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
27   if (iascii) {
28     ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
29     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
30       if (ptap->algType == 0) {
31         ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr);
32       } else if (ptap->algType == 1) {
33         ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr);
34       } else if (ptap->algType == 2) {
35         ierr = PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
36       } else if (ptap->algType == 3) {
37         ierr = PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");CHKERRQ(ierr);
38       }
39     }
40   }
41   PetscFunctionReturn(0);
42 }
43 
/*
  Destructor for the Mat_APMPI support structure attached to a PtAP product.
  Frees every intermediate object any of the PtAP algorithms may have created;
  PetscFree()/MatDestroy() on NULL members is a no-op, so unused members from
  other algorithms are harmless to pass through here.

  Input:
    data - the Mat_APMPI* stashed in C->product->data
*/
PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap = (Mat_APMPI*)data;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  /* communication buffers created by MatGetBrowsOfAoCols_MPIAIJ() */
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */
  ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr);
  if (ptap->AP_loc) { /* used by alg_rap */
    /* AP_loc was built with MatCreateSeqAIJWithArrays(), which does not own
       the i/j/a arrays, so they must be freed explicitly before the Mat */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
    ierr = PetscFree(ap->i);CHKERRQ(ierr);
    ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr);
    ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr);
  } else { /* used by alg_ptap */
    ierr = PetscFree(ptap->api);CHKERRQ(ierr);
    ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  }
  ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr);
  if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);}

  ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr);

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    /* send/recv bookkeeping from the sequential-to-MPI merge phase;
       buf_ri/buf_rj are arrays of pointers whose storage is one contiguous
       allocation anchored at index 0, hence free [0] then the pointer array */
    ierr = PetscFree(merge->id_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_s);CHKERRQ(ierr);
    ierr = PetscFree(merge->len_r);CHKERRQ(ierr);
    ierr = PetscFree(merge->bi);CHKERRQ(ierr);
    ierr = PetscFree(merge->bj);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr);
    ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr);
    ierr = PetscFree(merge->coi);CHKERRQ(ierr);
    ierr = PetscFree(merge->coj);CHKERRQ(ierr);
    ierr = PetscFree(merge->owners_co);CHKERRQ(ierr);
    ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr);
    ierr = PetscFree(ptap->merge);CHKERRQ(ierr);
  }
  ierr = ISLocalToGlobalMappingDestroy(&ptap->ltog);CHKERRQ(ierr);

  ierr = PetscSFDestroy(&ptap->sf);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_othi);CHKERRQ(ierr);
  ierr = PetscFree(ptap->c_rmti);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
98 
99 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
100 {
101   PetscErrorCode    ierr;
102   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
103   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
104   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
105   Mat_APMPI         *ptap;
106   Mat               AP_loc,C_loc,C_oth;
107   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
108   PetscScalar       *apa;
109   const PetscInt    *cols;
110   const PetscScalar *vals;
111 
112   PetscFunctionBegin;
113   MatCheckProduct(C,3);
114   ptap = (Mat_APMPI*)C->product->data;
115   PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
116   PetscCheckFalse(!ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
117 
118   ierr = MatZeroEntries(C);CHKERRQ(ierr);
119 
120   /* 1) get R = Pd^T,Ro = Po^T */
121   if (ptap->reuse == MAT_REUSE_MATRIX) {
122     ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
123     ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
124   }
125 
126   /* 2) get AP_loc */
127   AP_loc = ptap->AP_loc;
128   ap = (Mat_SeqAIJ*)AP_loc->data;
129 
130   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
131   /*-----------------------------------------------------*/
132   if (ptap->reuse == MAT_REUSE_MATRIX) {
133     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
134     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
135     ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
136   }
137 
138   /* 2-2) compute numeric A_loc*P - dominating part */
139   /* ---------------------------------------------- */
140   /* get data from symbolic products */
141   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
142   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
143 
144   api   = ap->i;
145   apj   = ap->j;
146   ierr = ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);CHKERRQ(ierr);
147   for (i=0; i<am; i++) {
148     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
149     apnz = api[i+1] - api[i];
150     apa = ap->a + api[i];
151     ierr = PetscArrayzero(apa,apnz);CHKERRQ(ierr);
152     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
153   }
154   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);CHKERRQ(ierr);
155   PetscCheckFalse(api[AP_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,api[AP_loc->rmap->n],nout);
156 
157   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
158   /* Always use scalable version since we are in the MPI scalable version */
159   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
160   ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
161 
162   C_loc = ptap->C_loc;
163   C_oth = ptap->C_oth;
164 
165   /* add C_loc and Co to to C */
166   ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
167 
168   /* C_loc -> C */
169   cm    = C_loc->rmap->N;
170   c_seq = (Mat_SeqAIJ*)C_loc->data;
171   cols = c_seq->j;
172   vals = c_seq->a;
173   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
174 
175   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
176   /* when there are no off-processor parts.  */
177   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
178   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
179   /* a table, and other, more complex stuff has to be done. */
180   if (C->assembled) {
181     C->was_assembled = PETSC_TRUE;
182     C->assembled     = PETSC_FALSE;
183   }
184   if (C->was_assembled) {
185     for (i=0; i<cm; i++) {
186       ncols = c_seq->i[i+1] - c_seq->i[i];
187       row = rstart + i;
188       ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
189       cols += ncols; vals += ncols;
190     }
191   } else {
192     ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
193   }
194   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
195   PetscCheckFalse(c_seq->i[C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
196 
197   /* Co -> C, off-processor part */
198   cm = C_oth->rmap->N;
199   c_seq = (Mat_SeqAIJ*)C_oth->data;
200   cols = c_seq->j;
201   vals = c_seq->a;
202   ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);CHKERRQ(ierr);
203   for (i=0; i<cm; i++) {
204     ncols = c_seq->i[i+1] - c_seq->i[i];
205     row = p->garray[i];
206     ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
207     cols += ncols; vals += ncols;
208   }
209   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
210   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
211 
212   ptap->reuse = MAT_REUSE_MATRIX;
213 
214   ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);CHKERRQ(ierr);
215   PetscCheckFalse(c_seq->i[C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
216   PetscFunctionReturn(0);
217 }
218 
/*
  Symbolic phase of C = P^T*A*P for MPIAIJ matrices, scalable algorithm (algType 0).
  Builds all structures needed by MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable() and
  attaches them to Cmpi->product->data as a Mat_APMPI:
    - P_loc/P_oth: local rows of P and the remote rows of P matching Ao's columns
    - Rd/Ro: transposes of P's diagonal/off-diagonal blocks
    - AP_loc: symbolic A_loc*P with compacted (local) column indices + ltog map
    - C_loc/C_oth: symbolic Rd*AP_loc and Ro*AP_loc
  The rows of C_oth belong to other processes, so their i/j structure is sent
  to the owners (nonblocking MPI), merged into the local rows, and used to
  preallocate Cmpi.  Cmpi is left unassembled; the numeric phase fills it.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  Mat                 P_loc,P_oth;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  const PetscInt      *owners;
  PetscInt            len,proc,*dnz,*onz,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
  MatType             mtype;
  const char          *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);

  /* ao stays NULL on one process: A has no off-diagonal block then */
  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* create struct Mat_APMPI and attached it to C later */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr);

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  /* ta counts the distinct columns appearing across P's rows; its count bounds
     the nnz of any merged row and sizes the condensed linked list below */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */

  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  /* row-by-row symbolic product: merge the column patterns of the P rows
     selected by each nonzero of A's row i into the sorted linked list */
  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscCalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  /* column indices are compacted to a local numbering; ltog maps them back */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);CHKERRQ(ierr);

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr);
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");CHKERRQ(ierr);

  ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_oth,"sorted");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetLocalSize(rowmap, pn);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(rowmap, 1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRanges(rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);

  /* expand C_oth's compacted column indices to global before sending */
  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);CHKERRQ(ierr);
  /* rows of C_oth are ordered by owning process (prmap is nondecreasing
     within each owner range), so a single sweep assigns rows to owners */
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRMPI(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  /* done here to overlap the local product with the coj communication above */
  ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr);

  ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");CHKERRQ(ierr);

  ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr);

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  ierr = ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);CHKERRQ(ierr);

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRMPI(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* per-message cursors into the received i-structures */
  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  /* merge C_loc rows with received C_oth rows to preallocate Cmpi */
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* compact C_oth/C_loc column indices back to local numbering for the numeric phase */
  nout = 0;
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);CHKERRQ(ierr);
  PetscCheckFalse(c_oth->i[ptap->C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_oth->i[ptap->C_oth->rmap->n],nout);
  ierr = ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);CHKERRQ(ierr);
  PetscCheckFalse(c_loc->i[ptap->C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_loc->i[ptap->C_loc->rmap->n],nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(0);
}
587 
588 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
589 {
590   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
591   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
592   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
593   PetscInt             pcstart,pcend,column,offset;
594   PetscErrorCode       ierr;
595 
596   PetscFunctionBegin;
597   pcstart = P->cmap->rstart;
598   pcstart *= dof;
599   pcend   = P->cmap->rend;
600   pcend   *= dof;
601   /* diagonal portion: Ad[i,:]*P */
602   ai = ad->i;
603   nzi = ai[i+1] - ai[i];
604   aj  = ad->j + ai[i];
605   for (j=0; j<nzi; j++) {
606     row  = aj[j];
607     offset = row%dof;
608     row   /= dof;
609     nzpi = pd->i[row+1] - pd->i[row];
610     pj  = pd->j + pd->i[row];
611     for (k=0; k<nzpi; k++) {
612       ierr = PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);CHKERRQ(ierr);
613     }
614   }
615   /* off diag P */
616   for (j=0; j<nzi; j++) {
617     row  = aj[j];
618     offset = row%dof;
619     row   /= dof;
620     nzpi = po->i[row+1] - po->i[row];
621     pj  = po->j + po->i[row];
622     for (k=0; k<nzpi; k++) {
623       ierr = PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);CHKERRQ(ierr);
624     }
625   }
626 
627   /* off diagonal part: Ao[i, :]*P_oth */
628   if (ao) {
629     ai = ao->i;
630     pi = p_oth->i;
631     nzi = ai[i+1] - ai[i];
632     aj  = ao->j + ai[i];
633     for (j=0; j<nzi; j++) {
634       row  = aj[j];
635       offset = a->garray[row]%dof;
636       row  = map[row];
637       pnz  = pi[row+1] - pi[row];
638       p_othcols = p_oth->j + pi[row];
639       for (col=0; col<pnz; col++) {
640         column = p_othcols[col] * dof + offset;
641         if (column>=pcstart && column<pcend) {
642           ierr = PetscHSetIAdd(dht,column);CHKERRQ(ierr);
643         } else {
644           ierr = PetscHSetIAdd(oht,column);CHKERRQ(ierr);
645         }
646       }
647     }
648   } /* end if (ao) */
649   PetscFunctionReturn(0);
650 }
651 
/*
  Accumulate row i of the product A*P into the hash map hmap (global P column -> value).

  The row is assembled from three pieces:
    1) Ad[i,:]*Pd   - A's diagonal block times P's diagonal block (columns shifted by pcstart),
    2) Ad[i,:]*Po   - A's diagonal block times P's off-diagonal block (columns mapped through p->garray),
    3) Ao[i,:]*P_oth - A's off-diagonal block times the remote rows of P gathered in P_oth,
       with map translating A's off-diagonal column indices to P_oth row indices.

  dof interleaves `dof` scalar components per mesh point: `row/dof` selects the point and
  `row%dof` (offset) selects the component, so P's point-indexed rows are expanded back
  to scalar indices when columns are inserted into hmap.
*/
static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
  PetscScalar          ra,*aa,*pa;
  PetscErrorCode       ierr;

  PetscFunctionBegin;
  /* first global column of P owned by this rank, in scalar (dof-expanded) indexing */
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    offset = row%dof;   /* component within the mesh point */
    row    /= dof;      /* point index = row of Pd */
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    pa = pd->a + pd->i[row];
    /* Pd columns are local: shift by pcstart to get global columns */
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }
  /* same Ad[i,:] entries against P's off-diagonal block Po */
  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj = po->j + po->i[row];
    pa = po->a + po->i[row];
    /* Po columns are compressed: expand through p->garray to global columns */
    for (k=0; k<nzpi; k++) {
      ierr = PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);CHKERRQ(ierr);
    }
    ierr = PetscLogFlops(2.0*nzpi);CHKERRQ(ierr);
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      offset = a->garray[row]%dof;  /* component taken from the global column of A */
      row    = map[row];            /* translate to the corresponding P_oth row */
      ra   = aa[j];
      pnz  = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa   = p_oth->a + pi[row];
      /* P_oth columns are already global (dof-expanded by offset) */
      for (col=0; col<pnz; col++) {
        ierr = PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);CHKERRQ(ierr);
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}
720 
721 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
722 
723 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
724 {
725   PetscErrorCode    ierr;
726   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
727   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
728   Mat_APMPI         *ptap;
729   PetscHMapIV       hmap;
730   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
731   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
732   PetscInt          offset,ii,pocol;
733   const PetscInt    *mappingindices;
734   IS                map;
735 
736   PetscFunctionBegin;
737   MatCheckProduct(C,4);
738   ptap = (Mat_APMPI*)C->product->data;
739   PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
740   PetscCheckFalse(!ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
741 
742   ierr = MatZeroEntries(C);CHKERRQ(ierr);
743 
744   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
745   /*-----------------------------------------------------*/
746   if (ptap->reuse == MAT_REUSE_MATRIX) {
747     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
748     ierr =  MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
749   }
750   ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
751 
752   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
753   pon *= dof;
754   ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
755   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
756   cmaxr = 0;
757   for (i=0; i<pon; i++) {
758     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
759   }
760   ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
761   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
762   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
763   ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
764   for (i=0; i<am && pon; i++) {
765     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
766     offset = i%dof;
767     ii     = i/dof;
768     nzi = po->i[ii+1] - po->i[ii];
769     if (!nzi) continue;
770     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
771     voff = 0;
772     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
773     if (!voff) continue;
774 
775     /* Form C(ii, :) */
776     poj = po->j + po->i[ii];
777     poa = po->a + po->i[ii];
778     for (j=0; j<nzi; j++) {
779       pocol = poj[j]*dof+offset;
780       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
781       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
782       for (jj=0; jj<voff; jj++) {
783         apvaluestmp[jj] = apvalues[jj]*poa[j];
784         /*If the row is empty */
785         if (!c_rmtc[pocol]) {
786           c_rmtjj[jj] = apindices[jj];
787           c_rmtaa[jj] = apvaluestmp[jj];
788           c_rmtc[pocol]++;
789         } else {
790           ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
791           if (loc>=0){ /* hit */
792             c_rmtaa[loc] += apvaluestmp[jj];
793             ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
794           } else { /* new element */
795             loc = -(loc+1);
796             /* Move data backward */
797             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
798               c_rmtjj[kk] = c_rmtjj[kk-1];
799               c_rmtaa[kk] = c_rmtaa[kk-1];
800             }/* End kk */
801             c_rmtjj[loc] = apindices[jj];
802             c_rmtaa[loc] = apvaluestmp[jj];
803             c_rmtc[pocol]++;
804           }
805         }
806         ierr = PetscLogFlops(voff);CHKERRQ(ierr);
807       } /* End jj */
808     } /* End j */
809   } /* End i */
810 
811   ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
812   ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
813 
814   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
815   pn *= dof;
816   ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);
817 
818   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
819   ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
820   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
821   pcstart = pcstart*dof;
822   pcend   = pcend*dof;
823   cd = (Mat_SeqAIJ*)(c->A)->data;
824   co = (Mat_SeqAIJ*)(c->B)->data;
825 
826   cmaxr = 0;
827   for (i=0; i<pn; i++) {
828     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
829   }
830   ierr = PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);CHKERRQ(ierr);
831   ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
832   ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
833   for (i=0; i<am && pn; i++) {
834     ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
835     offset = i%dof;
836     ii     = i/dof;
837     nzi = pd->i[ii+1] - pd->i[ii];
838     if (!nzi) continue;
839     ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
840     voff = 0;
841     ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
842     if (!voff) continue;
843     /* Form C(ii, :) */
844     pdj = pd->j + pd->i[ii];
845     pda = pd->a + pd->i[ii];
846     for (j=0; j<nzi; j++) {
847       row = pcstart + pdj[j] * dof + offset;
848       for (jj=0; jj<voff; jj++) {
849         apvaluestmp[jj] = apvalues[jj]*pda[j];
850       }
851       ierr = PetscLogFlops(voff);CHKERRQ(ierr);
852       ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
853     }
854   }
855   ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
856   ierr = MatGetOwnershipRangeColumn(C,&ccstart,&ccend);CHKERRQ(ierr);
857   ierr = PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);CHKERRQ(ierr);
858   ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
859   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
860   ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
861   ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);
862 
863   /* Add contributions from remote */
864   for (i = 0; i < pn; i++) {
865     row = i + pcstart;
866     ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
867   }
868   ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);
869 
870   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
871   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
872 
873   ptap->reuse = MAT_REUSE_MATRIX;
874   PetscFunctionReturn(0);
875 }
876 
877 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
878 {
879   PetscErrorCode      ierr;
880 
881   PetscFunctionBegin;
882 
883   ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);CHKERRQ(ierr);
884   PetscFunctionReturn(0);
885 }
886 
/*
  Numeric phase of C = P^T*A*P, "merged" all-at-once variant: unlike the
  overlapping variant above, a SINGLE sweep over the local rows of A forms each
  row of A*P once and scatters it both to remote rows (via Po and the sorted
  c_rmtj/c_rmta buffers) and to local rows (via Pd and MatSetValues).  The SF
  reduction therefore happens after the sweep, with no communication/computation
  overlap.  Requires the Mat_APMPI data built by the matching symbolic phase.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI         *ptap;
  PetscHMapIV       hmap;
  PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt          offset,ii,pocol;
  const PetscInt    *mappingindices;
  IS                map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheckFalse(!ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr =  MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  }
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn  *= dof;

  ierr = PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);CHKERRQ(ierr);
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  /* cmaxr bounds scratch/hash sizes: widest row among both remote and local targets */
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  ierr = PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVCreate(&hmap);CHKERRQ(ierr);
  ierr = PetscHMapIVResize(hmap,cmaxr);CHKERRQ(ierr);
  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  /* single sweep: each row of A*P is used for both remote and local targets */
  for (i=0; i<am && (pon || pn); i++) {
    ierr = PetscHMapIVClear(hmap);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue; /* row of P^T is empty: nothing to scatter */
    ierr = MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);CHKERRQ(ierr);
    voff = 0;
    ierr = PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);CHKERRQ(ierr);
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /*If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* buffer kept sorted by column: binary search for insert/accumulate */
          ierr = PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);CHKERRQ(ierr);
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            ierr = PetscLogFlops(1.0);CHKERRQ(ierr);
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }/* End kk */
      ierr = PetscLogFlops(voff);CHKERRQ(ierr);
      ierr = MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);CHKERRQ(ierr);
    }/* End j */
  } /* End i */

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
  ierr = PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);CHKERRQ(ierr);
  ierr = PetscHMapIVDestroy(&hmap);CHKERRQ(ierr);
  ierr = PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);CHKERRQ(ierr);

  /* back-to-back reduce: no overlap with computation in this variant */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree2(c_rmtj,c_rmta);CHKERRQ(ierr);

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    ierr = MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscFree2(c_othj,c_otha);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
1022 
1023 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1024 {
1025   PetscErrorCode      ierr;
1026 
1027   PetscFunctionBegin;
1028 
1029   ierr = MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);CHKERRQ(ierr);
1030   PetscFunctionReturn(0);
1031 }
1032 
/* TODO: move algorithm selection to MatProductSetFromOptions */
/*
  Symbolic phase of C = P^T*A*P, all-at-once algorithm.  Determines the sparsity
  pattern of C, preallocates it, and builds the Mat_APMPI support structure
  (c_rmti/c_othi row offsets and the PetscSF used to ship remote contributions)
  that the numeric phase consumes.  Also selects which numeric kernel to install
  (overlapping or merged) from the -matptap_allatonce_via option.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType             mtype;
  PetscSF             sf;
  PetscSFNode         *iremote;
  PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt      *rootdegrees;
  PetscHSetI          ht,oht,*hta,*hto;
  PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt            nalg=2,alg=0,offset,ii;
  PetscMPIInt         owner;
  const PetscInt      *mappingindices;
  PetscBool           flg;
  const char          *algTypes[2] = {"overlapping","merged"};
  IS                  map;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);

  /* Create symbolic parallel matrix Cmpi */
  ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
  pn *= dof;
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 2; /* reported by MatView_MPIAIJ_PtAP as "allatonce" */

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
  /* This equals to the number of offdiag columns in P */
  ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
  pon *= dof;
  /* offsets */
  ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
  /* The number of columns we will send to remote ranks */
  ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
  ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
  }
  ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
  /* NOTE(review): arstart/arend appear unused in the rest of this routine — candidate for removal */
  ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
  /* Create hash table to merge all columns for C(i, :) */
  ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);

  ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
  for (i=0; i<am && pon; i++) {
    /* Form one row of AP */
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;

    /* ht is passed for both the diag and offdiag output sets: everything sent
       remotely lands in a single column set */
    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
    }
  }

  /* turn per-row set sizes into CSR-style offsets c_rmti and counts c_rmtc */
  for (i=0; i<pon; i++) {
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    off = 0;
    ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(hta);CHKERRQ(ierr);

  /* leaves: one per remote target row; root is the owner of that row of C */
  ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
  /* How many neighbors have contributions to my rows? */
  ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
  ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
  ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
  ierr = PetscCalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
  /* The number of columns is received for each row */
  ptap->c_othi[0] = 0;
  rootspacesize = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  ierr = PetscFree(rootspace);CHKERRQ(ierr);

  /* tell each sender where its data starts inside the receiver's buffer */
  ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);

  /* build the per-entry SF (one leaf per sent column) used by the numeric phase */
  ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii = i/dof;
    ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
  ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);

  ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
  /* One to one map */
  ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);

  ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
  ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
  ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
  pcstart *= dof;
  pcend   *= dof;
  ierr = PetscMalloc2(pn,&hta,pn,&hto);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i=0; i<am && pn; i++) {
    ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
    ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;

    ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
    if (!(htsize+htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<nzi; j++) {
      ierr = PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
      ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
    }
  }

  ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);

  ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
  ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);

  /* Get remote data */
  ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(c_rmtj);CHKERRQ(ierr);

  /* merge received columns into the local diag/offdiag sets and count nonzeros */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        ierr = PetscHSetIAdd(hta[i],col);CHKERRQ(ierr);
      } else { /* off diag */
        ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
      }
    }
    ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
    dnz[i] = htsize;
    ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
    ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
    onz[i] = htsize;
    ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree2(hta,hto);CHKERRQ(ierr);
  ierr = PetscFree(c_othj);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatSetUp(Cmpi);CHKERRQ(ierr);
  ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
  alg = 0;
  ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  switch (alg) {
    case 0:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
      break;
    case 1:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(0);
}
1298 
1299 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1300 {
1301   PetscErrorCode ierr;
1302 
1303   PetscFunctionBegin;
1304   ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);CHKERRQ(ierr);
1305   PetscFunctionReturn(0);
1306 }
1307 
1308 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1309 {
1310   Mat_APMPI           *ptap;
1311   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
1312   MPI_Comm            comm;
1313   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1314   MatType             mtype;
1315   PetscSF             sf;
1316   PetscSFNode         *iremote;
1317   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1318   const PetscInt      *rootdegrees;
1319   PetscHSetI          ht,oht,*hta,*hto,*htd;
1320   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1321   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1322   PetscInt            nalg=2,alg=0,offset,ii;
1323   PetscMPIInt         owner;
1324   PetscBool           flg;
1325   const char          *algTypes[2] = {"merged","overlapping"};
1326   const PetscInt      *mappingindices;
1327   IS                  map;
1328   PetscErrorCode      ierr;
1329 
1330   PetscFunctionBegin;
1331   MatCheckProduct(Cmpi,5);
1332   PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
1333   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1334 
1335   /* Create symbolic parallel matrix Cmpi */
1336   ierr = MatGetLocalSize(P,NULL,&pn);CHKERRQ(ierr);
1337   pn *= dof;
1338   ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
1339   ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);
1340   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1341 
1342   ierr        = PetscNew(&ptap);CHKERRQ(ierr);
1343   ptap->reuse = MAT_INITIAL_MATRIX;
1344   ptap->algType = 3;
1345 
1346   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1347   ierr = MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);CHKERRQ(ierr);
1348   ierr = PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);CHKERRQ(ierr);
1349 
1350   /* This equals to the number of offdiag columns in P */
1351   ierr = MatGetLocalSize(p->B,NULL,&pon);CHKERRQ(ierr);
1352   pon *= dof;
1353   /* offsets */
1354   ierr = PetscMalloc1(pon+1,&ptap->c_rmti);CHKERRQ(ierr);
1355   /* The number of columns we will send to remote ranks */
1356   ierr = PetscMalloc1(pon,&c_rmtc);CHKERRQ(ierr);
1357   ierr = PetscMalloc1(pon,&hta);CHKERRQ(ierr);
1358   for (i=0; i<pon; i++) {
1359     ierr = PetscHSetICreate(&hta[i]);CHKERRQ(ierr);
1360   }
1361   ierr = MatGetLocalSize(A,&am,NULL);CHKERRQ(ierr);
1362   ierr = MatGetOwnershipRange(A,&arstart,&arend);CHKERRQ(ierr);
1363   /* Create hash table to merge all columns for C(i, :) */
1364   ierr = PetscHSetICreate(&ht);CHKERRQ(ierr);
1365   ierr = PetscHSetICreate(&oht);CHKERRQ(ierr);
1366   ierr = PetscMalloc2(pn,&htd,pn,&hto);CHKERRQ(ierr);
1367   for (i=0; i<pn; i++) {
1368     ierr = PetscHSetICreate(&htd[i]);CHKERRQ(ierr);
1369     ierr = PetscHSetICreate(&hto[i]);CHKERRQ(ierr);
1370   }
1371 
1372   ierr = ISGetIndices(map,&mappingindices);CHKERRQ(ierr);
1373   ptap->c_rmti[0] = 0;
1374   /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
1375   for (i=0; i<am && (pon || pn); i++) {
1376     /* Form one row of AP */
1377     ierr = PetscHSetIClear(ht);CHKERRQ(ierr);
1378     ierr = PetscHSetIClear(oht);CHKERRQ(ierr);
1379     offset = i%dof;
1380     ii     = i/dof;
1381     /* If the off diag is empty, we should not do any calculation */
1382     nzi = po->i[ii+1] - po->i[ii];
1383     dnzi = pd->i[ii+1] - pd->i[ii];
1384     if (!nzi && !dnzi) continue;
1385 
1386     ierr = MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);CHKERRQ(ierr);
1387     ierr = PetscHSetIGetSize(ht,&htsize);CHKERRQ(ierr);
1388     ierr = PetscHSetIGetSize(oht,&htosize);CHKERRQ(ierr);
1389     /* If AP is empty, just continue */
1390     if (!(htsize+htosize)) continue;
1391 
1392     /* Form remote C(ii, :) */
1393     poj = po->j + po->i[ii];
1394     for (j=0; j<nzi; j++) {
1395       ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);CHKERRQ(ierr);
1396       ierr = PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);CHKERRQ(ierr);
1397     }
1398 
1399     /* Form local C(ii, :) */
1400     pdj = pd->j + pd->i[ii];
1401     for (j=0; j<dnzi; j++) {
1402       ierr = PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);CHKERRQ(ierr);
1403       ierr = PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);CHKERRQ(ierr);
1404     }
1405   }
1406 
1407   ierr = ISRestoreIndices(map,&mappingindices);CHKERRQ(ierr);
1408 
1409   ierr = PetscHSetIDestroy(&ht);CHKERRQ(ierr);
1410   ierr = PetscHSetIDestroy(&oht);CHKERRQ(ierr);
1411 
1412   for (i=0; i<pon; i++) {
1413     ierr = PetscHSetIGetSize(hta[i],&htsize);CHKERRQ(ierr);
1414     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1415     c_rmtc[i] = htsize;
1416   }
1417 
1418   ierr = PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);CHKERRQ(ierr);
1419 
1420   for (i=0; i<pon; i++) {
1421     off = 0;
1422     ierr = PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);CHKERRQ(ierr);
1423     ierr = PetscHSetIDestroy(&hta[i]);CHKERRQ(ierr);
1424   }
1425   ierr = PetscFree(hta);CHKERRQ(ierr);
1426 
1427   ierr = PetscMalloc1(pon,&iremote);CHKERRQ(ierr);
1428   for (i=0; i<pon; i++) {
1429     owner = 0; lidx = 0;
1430     offset = i%dof;
1431     ii     = i/dof;
1432     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);CHKERRQ(ierr);
1433     iremote[i].index = lidx*dof+offset;
1434     iremote[i].rank  = owner;
1435   }
1436 
1437   ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
1438   ierr = PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1439   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1440   ierr = PetscSFSetRankOrder(sf,PETSC_TRUE);CHKERRQ(ierr);
1441   ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
1442   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1443   /* How many neighbors have contributions to my rows? */
1444   ierr = PetscSFComputeDegreeBegin(sf,&rootdegrees);CHKERRQ(ierr);
1445   ierr = PetscSFComputeDegreeEnd(sf,&rootdegrees);CHKERRQ(ierr);
1446   rootspacesize = 0;
1447   for (i = 0; i < pn; i++) {
1448     rootspacesize += rootdegrees[i];
1449   }
1450   ierr = PetscMalloc1(rootspacesize,&rootspace);CHKERRQ(ierr);
1451   ierr = PetscMalloc1(rootspacesize+1,&rootspaceoffsets);CHKERRQ(ierr);
1452   /* Get information from leaves
1453    * Number of columns other people contribute to my rows
1454    * */
1455   ierr = PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1456   ierr = PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);CHKERRQ(ierr);
1457   ierr = PetscFree(c_rmtc);CHKERRQ(ierr);
1458   ierr = PetscMalloc1(pn+1,&ptap->c_othi);CHKERRQ(ierr);
1459   /* The number of columns is received for each row */
1460   ptap->c_othi[0]     = 0;
1461   rootspacesize       = 0;
1462   rootspaceoffsets[0] = 0;
1463   for (i = 0; i < pn; i++) {
1464     rcvncols = 0;
1465     for (j = 0; j<rootdegrees[i]; j++) {
1466       rcvncols += rootspace[rootspacesize];
1467       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1468       rootspacesize++;
1469     }
1470     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1471   }
1472   ierr = PetscFree(rootspace);CHKERRQ(ierr);
1473 
1474   ierr = PetscMalloc1(pon,&c_rmtoffsets);CHKERRQ(ierr);
1475   ierr = PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1476   ierr = PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);CHKERRQ(ierr);
1477   ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
1478   ierr = PetscFree(rootspaceoffsets);CHKERRQ(ierr);
1479 
1480   ierr = PetscCalloc1(ptap->c_rmti[pon],&iremote);CHKERRQ(ierr);
1481   nleaves = 0;
1482   for (i = 0; i<pon; i++) {
1483     owner = 0;
1484     ii    = i/dof;
1485     ierr = PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);CHKERRQ(ierr);
1486     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1487     for (j=0; j<sendncols; j++) {
1488       iremote[nleaves].rank    = owner;
1489       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1490     }
1491   }
1492   ierr = PetscFree(c_rmtoffsets);CHKERRQ(ierr);
1493   ierr = PetscCalloc1(ptap->c_othi[pn],&c_othj);CHKERRQ(ierr);
1494 
1495   ierr = PetscSFCreate(comm,&ptap->sf);CHKERRQ(ierr);
1496   ierr = PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1497   ierr = PetscSFSetFromOptions(ptap->sf);CHKERRQ(ierr);
1498   /* One to one map */
1499   ierr = PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
1500   /* Get remote data */
1501   ierr = PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);CHKERRQ(ierr);
1502   ierr = PetscFree(c_rmtj);CHKERRQ(ierr);
1503   ierr = PetscMalloc2(pn,&dnz,pn,&onz);CHKERRQ(ierr);
1504   ierr = MatGetOwnershipRangeColumn(P,&pcstart,&pcend);CHKERRQ(ierr);
1505   pcstart *= dof;
1506   pcend   *= dof;
1507   for (i = 0; i < pn; i++) {
1508     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1509     rdj = c_othj + ptap->c_othi[i];
1510     for (j = 0; j < nzi; j++) {
1511       col =  rdj[j];
1512       /* diag part */
1513       if (col>=pcstart && col<pcend) {
1514         ierr = PetscHSetIAdd(htd[i],col);CHKERRQ(ierr);
1515       } else { /* off diag */
1516         ierr = PetscHSetIAdd(hto[i],col);CHKERRQ(ierr);
1517       }
1518     }
1519     ierr = PetscHSetIGetSize(htd[i],&htsize);CHKERRQ(ierr);
1520     dnz[i] = htsize;
1521     ierr = PetscHSetIDestroy(&htd[i]);CHKERRQ(ierr);
1522     ierr = PetscHSetIGetSize(hto[i],&htsize);CHKERRQ(ierr);
1523     onz[i] = htsize;
1524     ierr = PetscHSetIDestroy(&hto[i]);CHKERRQ(ierr);
1525   }
1526 
1527   ierr = PetscFree2(htd,hto);CHKERRQ(ierr);
1528   ierr = PetscFree(c_othj);CHKERRQ(ierr);
1529 
1530   /* local sizes and preallocation */
1531   ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1532   ierr = MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);CHKERRQ(ierr);
1533   ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
1534   ierr = PetscFree2(dnz,onz);CHKERRQ(ierr);
1535 
1536   /* attach the supporting struct to Cmpi for reuse */
1537   Cmpi->product->data    = ptap;
1538   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1539   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
1540 
1541   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1542   Cmpi->assembled = PETSC_FALSE;
1543   /* pick an algorithm */
1544   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
1545   alg = 0;
1546   ierr = PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);CHKERRQ(ierr);
1547   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1548   switch (alg) {
1549     case 0:
1550       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1551       break;
1552     case 1:
1553       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1554       break;
1555     default:
1556       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
1557   }
1558   PetscFunctionReturn(0);
1559 }
1560 
1561 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1562 {
1563   PetscErrorCode ierr;
1564 
1565   PetscFunctionBegin;
1566   ierr = MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);CHKERRQ(ierr);
1567   PetscFunctionReturn(0);
1568 }
1569 
/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ - symbolic phase of C = P^T*A*P using the
   "nonscalable" algorithm: forms the local transposes Rd = Pd^T and Ro = Po^T,
   the explicit symbolic product AP_loc = A_loc*P, then the symbolic products
   C_loc = Rd*AP_loc and C_oth = Ro*AP_loc.  Rows of C_oth owned by other
   processes are communicated (steps 3-4) so that Cmpi can be preallocated
   exactly in step 5.

   Input:  A, P  - MPIAIJ matrices on the same communicator
           fill  - expected fill estimate used to size initial free space
   Output: Cmpi  - symbolic product; a Mat_APMPI support structure is attached
           to Cmpi->product for reuse by MatPtAPNumeric_MPIAIJ_MPIAIJ().
           Cmpi is left unassembled; assembly happens in the numeric phase.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscBT             lnkbt;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
  MatType             mtype;
  const char          *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  PetscCheckFalse(Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);

  /* the off-diagonal block of A exists only when there is more than one rank */
  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  ierr = MatGetType(A,&mtype);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,mtype);CHKERRQ(ierr);

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attached it to C later */
  ierr        = PetscNew(&ptap);CHKERRQ(ierr);
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 1;   /* 1 == nonscalable; reported by MatView_MPIAIJ_PtAP() */

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  ierr = MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr);
  ierr = MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr);

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr);
  } else {
    ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr);
  }
  current_space = free_space;
  nspacedouble  = 0;

  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
      }
    }
    /* lnk[0] holds the number of distinct columns collected for this row */
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Create AP_loc for reuse */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr);
  ierr = MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name);CHKERRQ(ierr);
#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr);
    ierr = PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr);
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* ------------------------------------ */
  ierr = MatGetOptionsPrefix(A,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->Ro,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");CHKERRQ(ierr);
  ierr = MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);CHKERRQ(ierr);
  ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->C_oth,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");CHKERRQ(ierr);
  ierr = MatProductSetType(ptap->C_oth,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_oth,"default");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_oth,fill);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(ptap->C_oth);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_oth);CHKERRQ(ierr);

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr);
  rowmap->n  = pn;
  rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(rowmap);CHKERRQ(ierr);
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_s,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(len_si,size);CHKERRQ(ierr);

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* rows of C_oth are indexed by p->garray (prmap), which is sorted, so a
     single sweep with a moving 'proc' finds each row's owner */
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRMPI(ierr);
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc, overlapped with the coj
     communication posted above */
  /* ---------------------------------------- */
  ierr = MatSetOptionsPrefix(ptap->Rd,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");CHKERRQ(ierr);
  ierr = MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);CHKERRQ(ierr);
  ierr = MatGetOptionsPrefix(Cmpi,&prefix);CHKERRQ(ierr);
  ierr = MatSetOptionsPrefix(ptap->C_loc,prefix);CHKERRQ(ierr);
  ierr = MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");CHKERRQ(ierr);
  ierr = MatProductSetType(ptap->C_loc,MATPRODUCT_AB);CHKERRQ(ierr);
  ierr = MatProductSetAlgorithm(ptap->C_loc,"default");CHKERRQ(ierr);
  ierr = MatProductSetFill(ptap->C_loc,fill);CHKERRQ(ierr);
  ierr = MatProductSetFromOptions(ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductSymbolic(ptap->C_loc);CHKERRQ(ierr);

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
  ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr);
  ierr = PetscTableDestroy(&ta);CHKERRQ(ierr);

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRMPI(ierr);
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRMPI(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRMPI(ierr);}

  ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  ierr          = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr);
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr);
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr);

  /* local sizes and preallocation */
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  if (P->cmap->bs > 0) {
    ierr = PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);CHKERRQ(ierr);
    ierr = PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);CHKERRQ(ierr);
  }
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* members in merge */
  ierr = PetscFree(id_r);CHKERRQ(ierr);
  ierr = PetscFree(len_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr);
  ierr = PetscFree(buf_rj);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr);

  /* dense accumulator of width pN used per row by the numeric phase */
  ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
1934 
/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - numeric phase companion of
   MatPtAPSymbolic_MPIAIJ_MPIAIJ().  Recomputes AP_loc = A_loc*P row by row
   using the dense accumulator ptap->apa (allocated in the symbolic phase),
   forms C_loc = Rd*AP_loc and C_oth = Ro*AP_loc, and adds both into C.

   Input:  A, P - MPIAIJ matrices matching the symbolic call
   Output: C    - assembled numeric product P^T*A*P; ptap->reuse is switched
           to MAT_REUSE_MATRIX so subsequent calls update in place.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C,3);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheckFalse(!ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheckFalse(!ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  ierr = MatZeroEntries(C);CHKERRQ(ierr);
  /* 1) get R = Pd^T,Ro = Po^T */
  /* transposes were built in the symbolic phase; refresh values only on reuse */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    ierr = MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa   = ptap->apa;
  api   = ap->i;
  apj   = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    /* gather the row from the dense accumulator into AP_loc's CSR storage,
       zeroing apa as we go so it is clean for the next row */
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  ierr = PetscObjectStateIncrease((PetscObject)AP_loc);CHKERRQ(ierr);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = MatProductNumeric(ptap->C_loc);CHKERRQ(ierr);
  ierr = MatProductNumeric(ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts.  */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row = rstart + i;
      ierr = MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
      cols += ncols; vals += ncols;
    }
  } else {
    ierr = MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);CHKERRQ(ierr);
  }

  /* Co -> C, off-processor part */
  cm = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row = p->garray[i]; /* global row owned by another process */
    ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /* later calls reuse the structures built by the symbolic phase */
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
2050 
2051 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2052 {
2053   PetscErrorCode      ierr;
2054   Mat_Product         *product = C->product;
2055   Mat                 A=product->A,P=product->B;
2056   MatProductAlgorithm alg=product->alg;
2057   PetscReal           fill=product->fill;
2058   PetscBool           flg;
2059 
2060   PetscFunctionBegin;
2061   /* scalable: do R=P^T locally, then C=R*A*P */
2062   ierr = PetscStrcmp(alg,"scalable",&flg);CHKERRQ(ierr);
2063   if (flg) {
2064     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);CHKERRQ(ierr);
2065     C->ops->productnumeric = MatProductNumeric_PtAP;
2066     goto next;
2067   }
2068 
2069   /* nonscalable: do R=P^T locally, then C=R*A*P */
2070   ierr = PetscStrcmp(alg,"nonscalable",&flg);CHKERRQ(ierr);
2071   if (flg) {
2072     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
2073     goto next;
2074   }
2075 
2076   /* allatonce */
2077   ierr = PetscStrcmp(alg,"allatonce",&flg);CHKERRQ(ierr);
2078   if (flg) {
2079     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);CHKERRQ(ierr);
2080     goto next;
2081   }
2082 
2083   /* allatonce_merged */
2084   ierr = PetscStrcmp(alg,"allatonce_merged",&flg);CHKERRQ(ierr);
2085   if (flg) {
2086     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);CHKERRQ(ierr);
2087     goto next;
2088   }
2089 
2090   /* backend general code */
2091   ierr = PetscStrcmp(alg,"backend",&flg);CHKERRQ(ierr);
2092   if (flg) {
2093     ierr = MatProductSymbolic_MPIAIJBACKEND(C);CHKERRQ(ierr);
2094     PetscFunctionReturn(0);
2095   }
2096 
2097   /* hypre */
2098 #if defined(PETSC_HAVE_HYPRE)
2099   ierr = PetscStrcmp(alg,"hypre",&flg);CHKERRQ(ierr);
2100   if (flg) {
2101     ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr);
2102     C->ops->productnumeric = MatProductNumeric_PtAP;
2103     PetscFunctionReturn(0);
2104   }
2105 #endif
2106   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
2107 
2108 next:
2109   C->ops->productnumeric = MatProductNumeric_PtAP;
2110   PetscFunctionReturn(0);
2111 }
2112