xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision ffa8c5705e8ab2cf85ee1d14dbe507a6e2eb5283)
1 
2 /*
3   Defines projective product routines where A is a MPIAIJ matrix
4           C = P^T * A * P
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h>   /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <../src/mat/impls/aij/mpi/mpiaij.h>
10 #include <petscbt.h>
11 #include <petsctime.h>
12 #include <petsc/private/hashmapiv.h>
13 #include <petsc/private/hashseti.h>
14 #include <petscsf.h>
15 
16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
17 {
18   PetscBool         iascii;
19   PetscViewerFormat format;
20   Mat_APMPI         *ptap;
21 
22   PetscFunctionBegin;
23   MatCheckProduct(A,1);
24   ptap = (Mat_APMPI*)A->product->data;
25   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
26   if (iascii) {
27     PetscCall(PetscViewerGetFormat(viewer,&format));
28     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
29       if (ptap->algType == 0) {
30         PetscCall(PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n"));
31       } else if (ptap->algType == 1) {
32         PetscCall(PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n"));
33       } else if (ptap->algType == 2) {
34         PetscCall(PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n"));
35       } else if (ptap->algType == 3) {
36         PetscCall(PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n"));
37       }
38     }
39   }
40   PetscFunctionReturn(0);
41 }
42 
43 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
44 {
45   Mat_APMPI           *ptap = (Mat_APMPI*)data;
46   Mat_Merge_SeqsToMPI *merge;
47 
48   PetscFunctionBegin;
49   PetscCall(PetscFree2(ptap->startsj_s,ptap->startsj_r));
50   PetscCall(PetscFree(ptap->bufa));
51   PetscCall(MatDestroy(&ptap->P_loc));
52   PetscCall(MatDestroy(&ptap->P_oth));
53   PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
54   PetscCall(MatDestroy(&ptap->Rd));
55   PetscCall(MatDestroy(&ptap->Ro));
56   if (ptap->AP_loc) { /* used by alg_rap */
57     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
58     PetscCall(PetscFree(ap->i));
59     PetscCall(PetscFree2(ap->j,ap->a));
60     PetscCall(MatDestroy(&ptap->AP_loc));
61   } else { /* used by alg_ptap */
62     PetscCall(PetscFree(ptap->api));
63     PetscCall(PetscFree(ptap->apj));
64   }
65   PetscCall(MatDestroy(&ptap->C_loc));
66   PetscCall(MatDestroy(&ptap->C_oth));
67   if (ptap->apa) PetscCall(PetscFree(ptap->apa));
68 
69   PetscCall(MatDestroy(&ptap->Pt));
70 
71   merge = ptap->merge;
72   if (merge) { /* used by alg_ptap */
73     PetscCall(PetscFree(merge->id_r));
74     PetscCall(PetscFree(merge->len_s));
75     PetscCall(PetscFree(merge->len_r));
76     PetscCall(PetscFree(merge->bi));
77     PetscCall(PetscFree(merge->bj));
78     PetscCall(PetscFree(merge->buf_ri[0]));
79     PetscCall(PetscFree(merge->buf_ri));
80     PetscCall(PetscFree(merge->buf_rj[0]));
81     PetscCall(PetscFree(merge->buf_rj));
82     PetscCall(PetscFree(merge->coi));
83     PetscCall(PetscFree(merge->coj));
84     PetscCall(PetscFree(merge->owners_co));
85     PetscCall(PetscLayoutDestroy(&merge->rowmap));
86     PetscCall(PetscFree(ptap->merge));
87   }
88   PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));
89 
90   PetscCall(PetscSFDestroy(&ptap->sf));
91   PetscCall(PetscFree(ptap->c_othi));
92   PetscCall(PetscFree(ptap->c_rmti));
93   PetscCall(PetscFree(ptap));
94   PetscFunctionReturn(0);
95 }
96 
97 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
98 {
99   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
100   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
101   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
102   Mat_APMPI         *ptap;
103   Mat               AP_loc,C_loc,C_oth;
104   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
105   PetscScalar       *apa;
106   const PetscInt    *cols;
107   const PetscScalar *vals;
108 
109   PetscFunctionBegin;
110   MatCheckProduct(C,3);
111   ptap = (Mat_APMPI*)C->product->data;
112   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
113   PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
114 
115   PetscCall(MatZeroEntries(C));
116 
117   /* 1) get R = Pd^T,Ro = Po^T */
118   if (ptap->reuse == MAT_REUSE_MATRIX) {
119     PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
120     PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
121   }
122 
123   /* 2) get AP_loc */
124   AP_loc = ptap->AP_loc;
125   ap = (Mat_SeqAIJ*)AP_loc->data;
126 
127   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
128   /*-----------------------------------------------------*/
129   if (ptap->reuse == MAT_REUSE_MATRIX) {
130     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
131     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
132     PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
133   }
134 
135   /* 2-2) compute numeric A_loc*P - dominating part */
136   /* ---------------------------------------------- */
137   /* get data from symbolic products */
138   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
139   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
140 
141   api   = ap->i;
142   apj   = ap->j;
143   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj));
144   for (i=0; i<am; i++) {
145     /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
146     apnz = api[i+1] - api[i];
147     apa = ap->a + api[i];
148     PetscCall(PetscArrayzero(apa,apnz));
149     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
150   }
151   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj));
152   PetscCheckFalse(api[AP_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,api[AP_loc->rmap->n],nout);
153 
154   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
155   /* Always use scalable version since we are in the MPI scalable version */
156   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc));
157   PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth));
158 
159   C_loc = ptap->C_loc;
160   C_oth = ptap->C_oth;
161 
162   /* add C_loc and Co to to C */
163   PetscCall(MatGetOwnershipRange(C,&rstart,&rend));
164 
165   /* C_loc -> C */
166   cm    = C_loc->rmap->N;
167   c_seq = (Mat_SeqAIJ*)C_loc->data;
168   cols = c_seq->j;
169   vals = c_seq->a;
170   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j));
171 
172   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
173   /* when there are no off-processor parts.  */
174   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
175   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
176   /* a table, and other, more complex stuff has to be done. */
177   if (C->assembled) {
178     C->was_assembled = PETSC_TRUE;
179     C->assembled     = PETSC_FALSE;
180   }
181   if (C->was_assembled) {
182     for (i=0; i<cm; i++) {
183       ncols = c_seq->i[i+1] - c_seq->i[i];
184       row = rstart + i;
185       PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
186       cols += ncols; vals += ncols;
187     }
188   } else {
189     PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
190   }
191   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j));
192   PetscCheckFalse(c_seq->i[C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
193 
194   /* Co -> C, off-processor part */
195   cm = C_oth->rmap->N;
196   c_seq = (Mat_SeqAIJ*)C_oth->data;
197   cols = c_seq->j;
198   vals = c_seq->a;
199   PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j));
200   for (i=0; i<cm; i++) {
201     ncols = c_seq->i[i+1] - c_seq->i[i];
202     row = p->garray[i];
203     PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
204     cols += ncols; vals += ncols;
205   }
206   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
207   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
208 
209   ptap->reuse = MAT_REUSE_MATRIX;
210 
211   PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j));
212   PetscCheckFalse(c_seq->i[C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout);
213   PetscFunctionReturn(0);
214 }
215 
/*
  Symbolic phase of the memory-scalable C = P^T*A*P for MPIAIJ matrices.

  Builds and attaches a Mat_APMPI structure (ptap) to Cmpi->product->data
  containing everything the numeric phase reuses:
    - P_loc/P_oth: local rows of P and the off-process rows of P needed by A,
    - Rd = Pd^T, Ro = Po^T,
    - AP_loc: symbolic A_loc*P with columns compacted through ptap->ltog,
    - C_loc = Rd*AP_loc and C_oth = Ro*AP_loc (symbolic inner products),
  then exchanges the row/column structure of C_oth with the owning processes
  (overlapping the MPI_Isend/Irecv with the C_loc symbolic product) and uses
  the merged structure to preallocate Cmpi.  Cmpi is left unassembled; the
  numeric phase (MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable) fills it.

  `fill` is the user-supplied fill-ratio estimate used to size the initial
  free-space buffers.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  Mat                 P_loc,P_oth;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  const PetscInt      *owners;
  PetscInt            len,proc,*dnz,*onz,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
  Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
  MatType             mtype;
  const char          *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));

  /* the off-diagonal block only exists on more than one process */
  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));

  /* create struct Mat_APMPI and attached it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc));

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
  PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

  /* create and initialize a linked list */
  PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
  /* NOTE(review): P_oth is dereferenced here unconditionally while p_oth above
     is guarded; presumably MatGetBrowsOfAoCols_MPIAIJ always returns a
     (possibly empty) matrix — confirm for the single-process case */
  MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
  PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */

  PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am+1,&api));
  api[0] = 0;
  /* merge the column patterns of P rows selected by each row of A, one A-row
     at a time, into the sorted condensed linked list lnk */
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk));
      }
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk));

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscCalloc2(api[am],&apj,api[am],&apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space,apj));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

  /* Create AP_loc for reuse */
  /* AP_loc takes ownership of api/apj/apv; they are freed in
     MatDestroy_MPIAIJ_PtAP().  Column compaction produces ptap->ltog. */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
  PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* -------------------------------------- */
  PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(A,&prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_"));

  PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth,"sorted"));
  PetscCall(MatProductSetFill(ptap->C_oth,fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm,&rowmap));
  PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
  PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
  PetscCall(PetscLayoutSetUp(rowmap));
  PetscCall(PetscLayoutGetRanges(rowmap,&owners));

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
  PetscCall(PetscArrayzero(len_s,size));
  PetscCall(PetscArrayzero(len_si,size));

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* expand C_oth's compact column indices to global numbering before sending;
     they are mapped back at the end of this function */
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj));
  for (i=0; i<con; i++) {
    /* prmap (= p->garray) is sorted, so a single forward sweep of proc works */
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
  PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm,&tagj));
  PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
  PetscCall(PetscMalloc1(nsend+1,&swaits));
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  /* done here, between the Isend and the Waitany, to overlap computation
     with the coj communication above */
  PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
  PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default"));
  PetscCall(MatProductSetFill(ptap->C_loc,fill));

  PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_"));

  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j));

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
    }
  }
  PetscCall(PetscTableGetCount(ta,&Crmax));
  PetscCall(PetscTableDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm,&tagi));
  PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
  PetscCall(PetscMalloc1(len+1,&buf_s));
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));

  PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax,&free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  /* MatPreallocateInitialize is a macro that declares locals, so it cannot be
     wrapped in PetscCall() directly */
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);PetscCall(ierr);
  PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk));
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk));
    PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
  }
  PetscCall(PetscFree3(buf_ri_k,nextrow,nextci));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  ierr = MatPreallocateFinalize(dnz,onz);PetscCall(ierr);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* restore the compact column numbering of C_oth and C_loc for reuse by the
     numeric phase; every global index must map back (nout must match) */
  nout = 0;
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j));
  PetscCheckFalse(c_oth->i[ptap->C_oth->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_oth->i[ptap->C_oth->rmap->n],nout);
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j));
  PetscCheckFalse(c_loc->i[ptap->C_loc->rmap->n] != nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_loc->i[ptap->C_loc->rmap->n],nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(0);
}
584 
585 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
586 {
587   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
588   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
589   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
590   PetscInt             pcstart,pcend,column,offset;
591 
592   PetscFunctionBegin;
593   pcstart = P->cmap->rstart;
594   pcstart *= dof;
595   pcend   = P->cmap->rend;
596   pcend   *= dof;
597   /* diagonal portion: Ad[i,:]*P */
598   ai = ad->i;
599   nzi = ai[i+1] - ai[i];
600   aj  = ad->j + ai[i];
601   for (j=0; j<nzi; j++) {
602     row  = aj[j];
603     offset = row%dof;
604     row   /= dof;
605     nzpi = pd->i[row+1] - pd->i[row];
606     pj  = pd->j + pd->i[row];
607     for (k=0; k<nzpi; k++) {
608       PetscCall(PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart));
609     }
610   }
611   /* off diag P */
612   for (j=0; j<nzi; j++) {
613     row  = aj[j];
614     offset = row%dof;
615     row   /= dof;
616     nzpi = po->i[row+1] - po->i[row];
617     pj  = po->j + po->i[row];
618     for (k=0; k<nzpi; k++) {
619       PetscCall(PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset));
620     }
621   }
622 
623   /* off diagonal part: Ao[i, :]*P_oth */
624   if (ao) {
625     ai = ao->i;
626     pi = p_oth->i;
627     nzi = ai[i+1] - ai[i];
628     aj  = ao->j + ai[i];
629     for (j=0; j<nzi; j++) {
630       row  = aj[j];
631       offset = a->garray[row]%dof;
632       row  = map[row];
633       pnz  = pi[row+1] - pi[row];
634       p_othcols = p_oth->j + pi[row];
635       for (col=0; col<pnz; col++) {
636         column = p_othcols[col] * dof + offset;
637         if (column>=pcstart && column<pcend) {
638           PetscCall(PetscHSetIAdd(dht,column));
639         } else {
640           PetscCall(PetscHSetIAdd(oht,column));
641         }
642       }
643     }
644   } /* end if (ao) */
645   PetscFunctionReturn(0);
646 }
647 
/*
  Computes the values of row i of AP = A*P for the allatonce PtAP algorithm,
  accumulating (dof-expanded global column -> value) pairs into hmap.

  Numeric counterpart of MatPtAPSymbolicComputeOneRowOfAP_private(): the same
  three traversals (Ad*Pd, Ad-rows against Po, Ao*P_oth), but multiplying
  A and P values and summing them per column via PetscHMapIVAddValue().
  `map` translates Ao column indices into P_oth row indices; `dof` is the
  number of degrees of freedom packed per P row.
*/
static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
  PetscScalar          ra,*aa,*pa;

  PetscFunctionBegin;
  /* first locally-owned dof-expanded column of P */
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i+1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    offset = row%dof;   /* dof component within the packed P row */
    row    /= dof;      /* corresponding P row */
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    pa = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      PetscCall(PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]));
    }
    PetscCall(PetscLogFlops(2.0*nzpi));
  }
  /* same Ad rows against Po (off-diagonal P); columns globalized via garray */
  for (j=0; j<nzi; j++) {
    ra   = aa[j];
    row  = aj[j];
    offset = row%dof;
    row   /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj = po->j + po->i[row];
    pa = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      PetscCall(PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]));
    }
    PetscCall(PetscLogFlops(2.0*nzpi));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      offset = a->garray[row]%dof; /* global column of A fixes the dof component */
      row    = map[row];           /* translate to the P_oth row index */
      ra   = aa[j];
      pnz  = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa   = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        PetscCall(PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]));
      }
      PetscCall(PetscLogFlops(2.0*pnz));
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}
715 
716 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
717 
718 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
719 {
720   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
721   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
722   Mat_APMPI         *ptap;
723   PetscHMapIV       hmap;
724   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
725   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
726   PetscInt          offset,ii,pocol;
727   const PetscInt    *mappingindices;
728   IS                map;
729 
730   PetscFunctionBegin;
731   MatCheckProduct(C,4);
732   ptap = (Mat_APMPI*)C->product->data;
733   PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
734   PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");
735 
736   PetscCall(MatZeroEntries(C));
737 
738   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
739   /*-----------------------------------------------------*/
740   if (ptap->reuse == MAT_REUSE_MATRIX) {
741     /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
742     PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
743   }
744   PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
745 
746   PetscCall(MatGetLocalSize(p->B,NULL,&pon));
747   pon *= dof;
748   PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
749   PetscCall(MatGetLocalSize(A,&am,NULL));
750   cmaxr = 0;
751   for (i=0; i<pon; i++) {
752     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
753   }
754   PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
755   PetscCall(PetscHMapIVCreate(&hmap));
756   PetscCall(PetscHMapIVResize(hmap,cmaxr));
757   PetscCall(ISGetIndices(map,&mappingindices));
758   for (i=0; i<am && pon; i++) {
759     PetscCall(PetscHMapIVClear(hmap));
760     offset = i%dof;
761     ii     = i/dof;
762     nzi = po->i[ii+1] - po->i[ii];
763     if (!nzi) continue;
764     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
765     voff = 0;
766     PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
767     if (!voff) continue;
768 
769     /* Form C(ii, :) */
770     poj = po->j + po->i[ii];
771     poa = po->a + po->i[ii];
772     for (j=0; j<nzi; j++) {
773       pocol = poj[j]*dof+offset;
774       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
775       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
776       for (jj=0; jj<voff; jj++) {
777         apvaluestmp[jj] = apvalues[jj]*poa[j];
778         /*If the row is empty */
779         if (!c_rmtc[pocol]) {
780           c_rmtjj[jj] = apindices[jj];
781           c_rmtaa[jj] = apvaluestmp[jj];
782           c_rmtc[pocol]++;
783         } else {
784           PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
785           if (loc>=0){ /* hit */
786             c_rmtaa[loc] += apvaluestmp[jj];
787             PetscCall(PetscLogFlops(1.0));
788           } else { /* new element */
789             loc = -(loc+1);
790             /* Move data backward */
791             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
792               c_rmtjj[kk] = c_rmtjj[kk-1];
793               c_rmtaa[kk] = c_rmtaa[kk-1];
794             }/* End kk */
795             c_rmtjj[loc] = apindices[jj];
796             c_rmtaa[loc] = apvaluestmp[jj];
797             c_rmtc[pocol]++;
798           }
799         }
800         PetscCall(PetscLogFlops(voff));
801       } /* End jj */
802     } /* End j */
803   } /* End i */
804 
805   PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
806   PetscCall(PetscHMapIVDestroy(&hmap));
807 
808   PetscCall(MatGetLocalSize(P,NULL,&pn));
809   pn *= dof;
810   PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));
811 
812   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
813   PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
814   PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
815   pcstart = pcstart*dof;
816   pcend   = pcend*dof;
817   cd = (Mat_SeqAIJ*)(c->A)->data;
818   co = (Mat_SeqAIJ*)(c->B)->data;
819 
820   cmaxr = 0;
821   for (i=0; i<pn; i++) {
822     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
823   }
824   PetscCall(PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ));
825   PetscCall(PetscHMapIVCreate(&hmap));
826   PetscCall(PetscHMapIVResize(hmap,cmaxr));
827   for (i=0; i<am && pn; i++) {
828     PetscCall(PetscHMapIVClear(hmap));
829     offset = i%dof;
830     ii     = i/dof;
831     nzi = pd->i[ii+1] - pd->i[ii];
832     if (!nzi) continue;
833     PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
834     voff = 0;
835     PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
836     if (!voff) continue;
837     /* Form C(ii, :) */
838     pdj = pd->j + pd->i[ii];
839     pda = pd->a + pd->i[ii];
840     for (j=0; j<nzi; j++) {
841       row = pcstart + pdj[j] * dof + offset;
842       for (jj=0; jj<voff; jj++) {
843         apvaluestmp[jj] = apvalues[jj]*pda[j];
844       }
845       PetscCall(PetscLogFlops(voff));
846       PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
847     }
848   }
849   PetscCall(ISRestoreIndices(map,&mappingindices));
850   PetscCall(MatGetOwnershipRangeColumn(C,&ccstart,&ccend));
851   PetscCall(PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ));
852   PetscCall(PetscHMapIVDestroy(&hmap));
853   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
854   PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
855   PetscCall(PetscFree2(c_rmtj,c_rmta));
856 
857   /* Add contributions from remote */
858   for (i = 0; i < pn; i++) {
859     row = i + pcstart;
860     PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
861   }
862   PetscCall(PetscFree2(c_othj,c_otha));
863 
864   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
865   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
866 
867   ptap->reuse = MAT_REUSE_MATRIX;
868   PetscFunctionReturn(0);
869 }
870 
871 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
872 {
873   PetscFunctionBegin;
874 
875   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C));
876   PetscFunctionReturn(0);
877 }
878 
/*
  Numeric phase of C = P^T*A*P, "merged" all-at-once variant: unlike
  MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(), a single sweep over the local rows
  of A computes each row of AP once and uses it for BOTH the remote part
  (P off-diagonal, buffered in c_rmtj/c_rmta and reduced over ptap->sf) and the
  local part (P diagonal, inserted directly with MatSetValues).

  Collective on the communicator of C.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
{
  Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI         *ptap;
  PetscHMapIV       hmap;
  PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt          offset,ii,pocol;
  const PetscInt    *mappingindices;
  IS                map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
  /* pon = (number of off-diagonal columns of P) * dof; pn = local rows of C */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn  *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
  PetscCall(MatGetLocalSize(A,&am,NULL));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  /* cmaxr bounds both the widest remote row segment and the widest local row of C */
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
  PetscCall(PetscHMapIVCreate(&hmap));
  PetscCall(PetscHMapIVResize(hmap,cmaxr));
  PetscCall(ISGetIndices(map,&mappingindices));
  /* Single merged sweep: each row of AP feeds the remote and the local part of C */
  for (i=0; i<am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i%dof;
    ii     = i/dof;
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /*If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* Merge into the sorted remote row: binary search, then add or insert */
          PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }/* End kk */
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
    }/* End j */
  } /* End i */

  PetscCall(ISRestoreIndices(map,&mappingindices));
  PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));

  /* Ship the buffered remote contributions to their owning ranks */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj,c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj,c_otha));

  PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
1013 
1014 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1015 {
1016   PetscFunctionBegin;
1017 
1018   PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C));
1019   PetscFunctionReturn(0);
1020 }
1021 
/* TODO: move algorithm selection to MatProductSetFromOptions */
/*
  Symbolic phase of C = P^T*A*P, all-at-once algorithm: determines the sparsity
  of C and preallocates Cmpi. Two passes over the local rows of A build hash
  sets of column indices for (a) rows of C owned by other ranks (sent via a
  PetscSF attached to ptap->sf) and (b) locally owned rows (split into diagonal
  and off-diagonal counts dnz/onz for MatMPIAIJSetPreallocation). The numeric
  kernel is chosen here via -matptap_allatonce_via.

  Collective on the communicator of A. `fill` is currently unused by this
  hash-set based implementation.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType             mtype;
  PetscSF             sf;
  PetscSFNode         *iremote;
  PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt      *rootdegrees;
  PetscHSetI          ht,oht,*hta,*hto;
  PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt            nalg=2,alg=0,offset,ii;
  PetscMPIInt         owner;
  const PetscInt      *mappingindices;
  PetscBool           flg;
  const char          *algTypes[2] = {"overlapping","merged"};
  IS                  map;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 2;   /* reported by MatView_MPIAIJ_PtAP as "allatonce" */

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon,&c_rmtc));
  PetscCall(PetscMalloc1(pon,&hta));
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
  }
  PetscCall(MatGetLocalSize(A,&am,NULL));
  PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));

  PetscCall(ISGetIndices(map,&mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
  for (i=0; i<am && pon; i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      /* Union the AP row into the set of every off-diagonal P column it touches */
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
    }
  }

  /* c_rmti becomes the CSR-style offsets of the remote-row column buffer */
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));

  for (i=0; i<pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* One SF leaf per off-diagonal column of P, rooted at the rank that owns it */
  PetscCall(PetscMalloc1(pon,&iremote));
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  PetscCall(PetscMalloc1(rootspacesize,&rootspace));
  PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscCalloc1(pn+1,&ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0] = 0;
  rootspacesize = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each leaf at which offset its data lands in the receiver's buffer */
  PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Build ptap->sf: one leaf per column index to be sent, kept for the numeric phase */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));

  PetscCall(PetscSFCreate(comm,&ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  /* Reduce started here completes after the local symbolic pass below (overlap) */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));

  PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  PetscCall(PetscMalloc2(pn,&hta,pn,&hto));
  for (i=0; i<pn; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i=0; i<am && pn; i++) {
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    PetscCall(PetscHSetIGetSize(oht,&htosize));
    if (!(htsize+htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
    }
  }

  PetscCall(ISRestoreIndices(map,&mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));

  /* Fold received remote columns into diagonal/off-diagonal sets, then count */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        PetscCall(PetscHSetIAdd(hta[i],col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i],col));
      }
    }
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hta[i]));
    PetscCall(PetscHSetIGetSize(hto[i],&htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(hta,hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  PetscCall(MatSetUp(Cmpi));
  PetscCall(PetscFree2(dnz,onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");PetscCall(ierr);
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
  ierr = PetscOptionsEnd();PetscCall(ierr);
  switch (alg) {
    case 0:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
      break;
    case 1:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(0);
}
1287 
1288 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1289 {
1290   PetscFunctionBegin;
1291   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C));
1292   PetscFunctionReturn(0);
1293 }
1294 
/*
  Symbolic phase of C = P^T*A*P, "merged" all-at-once variant: a single sweep
  over the local rows of A builds, per row of AP, both the remote column sets
  (hta, shipped via ptap->sf) and the local diagonal/off-diagonal column sets
  (htd/hto) used to preallocate Cmpi. Defaults the numeric kernel to the merged
  implementation (selectable with -matptap_allatonce_via).

  Collective on the communicator of A. `fill` is currently unused by this
  hash-set based implementation.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType             mtype;
  PetscSF             sf;
  PetscSFNode         *iremote;
  PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt      *rootdegrees;
  PetscHSetI          ht,oht,*hta,*hto,*htd;
  PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt            nalg=2,alg=0,offset,ii;
  PetscMPIInt         owner;
  PetscBool           flg;
  const char          *algTypes[2] = {"merged","overlapping"};
  const PetscInt      *mappingindices;
  IS                  map;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 3;   /* reported by MatView_MPIAIJ_PtAP as "merged allatonce" */

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));

  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon,&c_rmtc));
  PetscCall(PetscMalloc1(pon,&hta));
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
  }
  PetscCall(MatGetLocalSize(A,&am,NULL));
  PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));
  PetscCall(PetscHSetICreate(&oht));
  /* htd/hto collect diagonal and off-diagonal column sets of the local rows of C */
  PetscCall(PetscMalloc2(pn,&htd,pn,&hto));
  for (i=0; i<pn; i++) {
    PetscCall(PetscHSetICreate(&htd[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }

  PetscCall(ISGetIndices(map,&mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors)  */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    PetscCall(PetscHSetIGetSize(oht,&htosize));
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],oht));
    }

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      PetscCall(PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
    }
  }

  PetscCall(ISRestoreIndices(map,&mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* c_rmti becomes the CSR-style offsets of the remote-row column buffer */
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));

  for (i=0; i<pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* One SF leaf per off-diagonal column of P, rooted at the rank that owns it */
  PetscCall(PetscMalloc1(pon,&iremote));
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
    iremote[i].index = lidx*dof+offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  PetscCall(PetscMalloc1(rootspacesize,&rootspace));
  PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscMalloc1(pn+1,&ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each leaf at which offset its data lands in the receiver's buffer */
  PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Build ptap->sf: one leaf per column index to be sent, kept for the numeric phase */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii    = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));

  PetscCall(PetscSFCreate(comm,&ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));
  PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  /* Fold received remote columns into diagonal/off-diagonal sets, then count */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col =  rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        PetscCall(PetscHSetIAdd(htd[i],col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i],col));
      }
    }
    PetscCall(PetscHSetIGetSize(htd[i],&htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&htd[i]));
    PetscCall(PetscHSetIGetSize(hto[i],&htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(htd,hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  PetscCall(PetscFree2(dnz,onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");PetscCall(ierr);
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
  ierr = PetscOptionsEnd();PetscCall(ierr);
  switch (alg) {
    case 0:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
      break;
    case 1:
      Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(0);
}
1547 
1548 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1549 {
1550   PetscFunctionBegin;
1551   PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C));
1552   PetscFunctionReturn(0);
1553 }
1554 
/*
  MatPtAPSymbolic_MPIAIJ_MPIAIJ - Symbolic phase of C = P^T*A*P for MPIAIJ matrices
  (the "nonscalable" algorithm, ptap->algType = 1; the dense work array apa of
  length pN allocated at the end is what makes it nonscalable in P's global
  column count).

  Outline (numbered section markers below match this):
    (0)  Rd = Pd^T, Ro = Po^T (local transposes of P's diag/offdiag blocks)
    (1)  symbolic AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth via a condensed linked list
    (2-1) symbolic C_oth = Ro*AP_loc, (2-2) symbolic C_loc = Rd*AP_loc
    (3)(4) ship C_oth's row (i) and column (j) structure to the owning ranks
          with nonblocking MPI, overlapped with the local (2-2) product
    (5)  merge C_loc rows with received rows to preallocate Cmpi

  Collective on comm(A). Input: A (MPIAIJ), P (MPIAIJ), fill (expected
  nnz(C)/(nnz(A)+nnz(P)) ratio). Output: Cmpi preallocated, ptap struct attached
  to Cmpi->product for reuse by MatPtAPNumeric_MPIAIJ_MPIAIJ(); Cmpi is left
  unassembled.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  PetscErrorCode      ierr;
  Mat_APMPI           *ptap;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm            comm;
  PetscMPIInt         size,rank;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt            *lnk,i,k,pnz,row,nsend;
  PetscBT             lnkbt;
  PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  PetscLayout         rowmap;
  PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
  Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
  PetscScalar         *apv;
  PetscTable          ta;
  MatType             mtype;
  const char          *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal           apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,4);
  /* the symbolic phase owns the product data; a leftover would be leaked below */
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));

  /* A's off-diagonal block only exists in parallel runs */
  if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attached it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse = MAT_INITIAL_MATRIX;
  ptap->algType = 1; /* nonscalable algorithm; reported by MatView_MPIAIJ_PtAP() */

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc));

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd));
  PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  /* ----------------------------------------------------------------- */
  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
  /* NOTE(review): p_oth is NULL-guarded above, but ptap->P_oth->rmap->N here is
     dereferenced unconditionally -- TODO confirm P_oth is always non-NULL when
     this routine runs (cf. the explicit guards in the allatonce variants) */
  MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
  PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space));
  }
  current_space = free_space;
  nspacedouble  = 0; /* counts FreeSpace growth events; reported as "mallocs" below */

  PetscCall(PetscMalloc1(am+1,&api));
  api[0] = 0;
  /* row-by-row symbolic product: merge the sparsity of the P rows selected by
     each A row into the sorted condensed linked list, then flush to FreeSpace */
  for (i=0; i<am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai = ad->i; pi = p_loc->i;
    nzi = ai[i+1] - ai[i];
    aj  = ad->j + ai[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi[row+1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai = ao->i; pi = p_oth->i;
      nzi = ai[i+1] - ai[i];
      aj  = ao->j + ai[i];
      for (j=0; j<nzi; j++) {
        row  = aj[j];
        pnz  = pi[row+1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt));
      }
    }
    apnz     = lnk[0]; /* lnk[0] holds the current number of entries in the list */
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt));

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc2(api[am],&apj,api[am],&apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space,apj));
  PetscCall(PetscLLDestroy(lnk,lnkbt));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc));
  PetscCall(MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name));
#if defined(PETSC_USE_INFO)
  /* achieved fill ratio for AP_loc, reported via -info so users can tune 'fill' */
  if (ao) {
    apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
  } else {
    apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc  */
  /* ------------------------------------ */
  PetscCall(MatGetOptionsPrefix(A,&prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_"));
  PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(Cmpi,&prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_"));
  PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth,"default"));
  PetscCall(MatProductSetFill(ptap->C_oth,fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm,&rowmap));
  rowmap->n  = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co));
  PetscCall(PetscArrayzero(len_s,size));
  PetscCall(PetscArrayzero(len_si,size));

  c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
  coi   = c_oth->i; coj = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* prmap (= p->garray) is sorted, so a single sweep assigns each Co row to its owner */
  for (i=0; i<con; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv));
  PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm,&tagj));
  PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits));
  PetscCall(PetscMalloc1(nsend+1,&swaits));
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    /* rows destined for [proc] are contiguous in coj, so one send per destination */
    PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  /* ---------------------------------------- */
  /* done here (between Isend and Waitany) to overlap computation with communication */
  PetscCall(MatSetOptionsPrefix(ptap->Rd,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd,"inner_diag_"));
  PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc));
  PetscCall(MatGetOptionsPrefix(Cmpi,&prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_"));
  PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default"));
  PetscCall(MatProductSetFill(ptap->C_loc,fill));
  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

  /* receives coj are complete */
  for (i=0; i<nrecv; i++) {
    PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));

  /* add received column indices into ta to update Crmax */
  for (k=0; k<nrecv; k++) {/* k-th received message */
    Jptr = buf_rj[k];
    for (j=0; j<len_r[k]; j++) {
      /* shift by +1: PetscTable does not accept a 0 key */
      PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES));
    }
  }
  PetscCall(PetscTableGetCount(ta,&Crmax));
  PetscCall(PetscTableDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm,&tagi));
  PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits));
  PetscCall(PetscMalloc1(len+1,&buf_s));
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k));
    k++;
    buf_si += len_si[proc];
  }
  for (i=0; i<nrecv; i++) {
    PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus));

  PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax,&free_space));
  current_space = free_space;

  /* per-message cursors for walking the received i-structures in row order */
  PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci));
  for (k=0; k<nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);PetscCall(ierr);
  PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt));
  for (i=0; i<pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i+1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));

    /* add received col data into lnk */
    for (k=0; k<nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt));
        nextrow[k]++; nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt));
    PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz));
  }
  PetscCall(PetscFree3(buf_ri_k,nextrow,nextci));
  PetscCall(PetscLLDestroy(lnk,lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  ierr = MatPreallocateFinalize(dnz,onz);PetscCall(ierr);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* dense scratch row of global width pN used by the numeric phase -- this is
     the nonscalable part of the algorithm */
  PetscCall(PetscCalloc1(pN,&ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
1919 
/*
  MatPtAPNumeric_MPIAIJ_MPIAIJ - Numeric phase of C = P^T*A*P for MPIAIJ matrices
  (nonscalable algorithm). Fills in the values of C using the structures built by
  MatPtAPSymbolic_MPIAIJ_MPIAIJ() and stored in C->product->data:
    1) refresh Rd = Pd^T, Ro = Po^T (reuse path only)
    2) numeric AP_loc = A_loc*P, accumulating each row in the dense scratch
       array ptap->apa of global width pN
    3) C_loc = Rd*AP_loc and C_oth = Ro*AP_loc via the stored inner products
    4) scatter C_loc and C_oth into C with ADD_VALUES and assemble.
  On exit ptap->reuse is set to MAT_REUSE_MATRIX so subsequent calls take the
  reuse path.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C,3);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T,Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* first call: transposes were computed in the symbolic phase */
    PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
    PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa   = ptap->apa; /* dense scratch row of global width pN, zeroed between rows */
  api   = ap->i;
  apj   = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    /* gather the dense row back into AP_loc's sparse storage, re-zeroing apa
       entry by entry so it is clean for the next row */
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to to C */
  PetscCall(MatGetOwnershipRange(C,&rstart,&rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts.  */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    /* slow path: per-row MatSetValues with local-to-global row offset */
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
      cols += ncols; vals += ncols;
    }
  } else {
    /* fast path: bulk copy of the whole CSR structure */
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row = p->garray[i]; /* global row owned by another rank; stashed until assembly */
    PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
    cols += ncols; vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}
2034 
2035 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2036 {
2037   Mat_Product         *product = C->product;
2038   Mat                 A=product->A,P=product->B;
2039   MatProductAlgorithm alg=product->alg;
2040   PetscReal           fill=product->fill;
2041   PetscBool           flg;
2042 
2043   PetscFunctionBegin;
2044   /* scalable: do R=P^T locally, then C=R*A*P */
2045   PetscCall(PetscStrcmp(alg,"scalable",&flg));
2046   if (flg) {
2047     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C));
2048     C->ops->productnumeric = MatProductNumeric_PtAP;
2049     goto next;
2050   }
2051 
2052   /* nonscalable: do R=P^T locally, then C=R*A*P */
2053   PetscCall(PetscStrcmp(alg,"nonscalable",&flg));
2054   if (flg) {
2055     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C));
2056     goto next;
2057   }
2058 
2059   /* allatonce */
2060   PetscCall(PetscStrcmp(alg,"allatonce",&flg));
2061   if (flg) {
2062     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C));
2063     goto next;
2064   }
2065 
2066   /* allatonce_merged */
2067   PetscCall(PetscStrcmp(alg,"allatonce_merged",&flg));
2068   if (flg) {
2069     PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C));
2070     goto next;
2071   }
2072 
2073   /* backend general code */
2074   PetscCall(PetscStrcmp(alg,"backend",&flg));
2075   if (flg) {
2076     PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
2077     PetscFunctionReturn(0);
2078   }
2079 
2080   /* hypre */
2081 #if defined(PETSC_HAVE_HYPRE)
2082   PetscCall(PetscStrcmp(alg,"hypre",&flg));
2083   if (flg) {
2084     PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C));
2085     C->ops->productnumeric = MatProductNumeric_PtAP;
2086     PetscFunctionReturn(0);
2087   }
2088 #endif
2089   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
2090 
2091 next:
2092   C->ops->productnumeric = MatProductNumeric_PtAP;
2093   PetscFunctionReturn(0);
2094 }
2095