/* /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision ae77e1216f63a101c7dbd925cdbaa61ef86e6ca7) */

/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/
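
/*
   Usage (a minimal sketch, not taken from this file): user code reaches these routines
   through the generic MatPtAP() interface when A and P are both of type MPIAIJ:

       Mat A,P,C;
       ...                                                            /* assemble A and P */
       ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);   /* symbolic + numeric C = P^T*A*P */
       ...                                                            /* change values of A, same nonzero pattern */
       ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);     /* recompute C, reusing the symbolic data */

   The MAT_REUSE_MATRIX path relies on the "MatPtAPMPI" container that is attached to P below.
*/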

#include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
#include "src/mat/utils/freespace.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
    if (ptap->abi) {ierr = PetscFree(ptap->abi);CHKERRQ(ierr);}
    if (ptap->abj) {ierr = PetscFree(ptap->abj);CHKERRQ(ierr);}

    ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
    ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL;
    ptap->abnz_max = 0; /* symbolic A*P is not done yet */

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
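    /* After these two calls each process holds, in sequential (SeqAIJ) form, every row of P it
       needs to form its local rows of A*P: B_loc contains the locally owned rows of P, and
       B_oth contains the rows of P indexed by the nonzero off-diagonal columns of A, gathered
       from the owning processes.  No further communication is needed to multiply A_loc by P. */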

    /* attach the supporting struct to P for reuse */
    P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
    ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
    ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);

    /* now, compute symbolic local P^T*A*P */
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  } else if (scall == MAT_REUSE_MATRIX){
    ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
    if (container) {
      ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
    }

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

  } else {
    SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
  }
  /* now, compute numeric local P^T*A*P */
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  P_loc,P_oth,B_mpi;
  Mat_MatMatMultMPI    *ap;
  PetscObjectContainer container;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ,*pjj;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz;
  PetscInt             nlnk,*lnk,*cdi,*cdj,*owners_co,*coi,*coj,*cji;
  PetscInt             am=A->m,pN=P->N,pm=P->m,pn=P->n;
  PetscInt             i,j,k,pnz,row,cnz;
  PetscBT              lnkbt;
  PetscInt             prstart,prend;
  MPI_Comm             comm=A->comm;
  PetscMPIInt          size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             anzi,*bi,*bj,bnz;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
  MPI_Status           *status;
  Mat_Merge_SeqsToMPI  *merge;
  PetscInt             *api,*apj,*apJ,apnz,*prmap=p->garray,pon;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
  }
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* get data from P_loc and P_oth */
  P_loc=ap->B_loc; P_oth=ap->B_oth; prstart=ap->brstart;
  p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
  prend = prstart + pm;

  /* first, compute symbolic AP = A_loc*P */
  /*--------------------------------------*/
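  /* Row i of A*P has the union of the column patterns of the rows of P (local or gathered)
     that are indexed by the nonzeros of row i of A.  The union is accumulated in the sorted
     linked list lnk (with bit table lnkbt) and flushed into the FreeSpace buffers, which are
     later compacted into ap->abj; the per-row maximum abnz_max is recorded for sizing work
     arrays in the numeric phase. */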
  ierr = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ap->abi = api;
  api[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*nnz(A) */
  ierr = GetMoreSpace((PetscInt)(fill*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0;i<am;i++) {
    apnz = 0;
    /* diagonal portion of A */
    anzi = adi[i+1] - adi[i];
    for (j=0; j<anzi; j++){
      row = *adj++;
      pnz = pi_loc[row+1] - pi_loc[row];
      pjj = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnz,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      apnz += nlnk;
    }
    /* off-diagonal portion of A */
    anzi = aoi[i+1] - aoi[i];
    for (j=0; j<anzi; j++){
      row = *aoj++;
      pnz = pi_oth[row+1] - pi_oth[row];
      pjj = pj_oth + pi_oth[row];
      ierr = PetscLLAdd(pnz,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      apnz += nlnk;
    }

    api[i+1] = api[i] + apnz;
    if (ap->abnz_max < apnz) ap->abnz_max = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,apnz,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ap->abj);CHKERRQ(ierr);
  apj = ap->abj;
  ierr = MakeSpaceContiguous(&free_space,ap->abj);CHKERRQ(ierr);

  /* get ij structure of (p->A)^T and (p->B)^T */
  /*-------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* then, compute symbolic Co_seq = (p->B)^T*AP */
  /* Allocate coi array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  pon = (p->B)->n; /* total num of rows to be sent to other processors
                      >= (num of nonzero rows of C_seq) - pn */
  ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be 3*pon*max( nnz(AP) per row) */
  nnz           = 3*pon*ap->abnz_max + 1;
  ierr          = GetMoreSpace(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic Co=(p->B)^T*AP - send to others */
  /*----------------------------------------------------*/
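  /* The rows of C = P^T*A*P are distributed like the columns of P.  Rows generated here that
     belong to other processes come from the off-diagonal block p->B: Co = (p->B)^T*AP.  Its
     ij-structure (and later its values) is sent to the owning processes, while
     Cd = (p->A)^T*AP, computed further below, stays local. */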
  for (i=0; i<pon; i++) {
    cnz = 0;
    pnz = poti[i+1] - poti[i];
    j   = pnz;
    ptJ = potj + poti[i+1];
    while (j){/* assume cols are almost in increasing order; starting from the end saves computation */
      j--; ptJ--;
      row  = *ptJ; /* row of AP == col of Pot */
      apnz = api[row+1] - api[row];
      apJ  = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLAdd(apnz,apJ,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnz += nlnk;
    }

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<cnz) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,cnz,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnz;
    current_space->local_used      += cnz;
    current_space->local_remaining -= cnz;
    coi[i+1] = coi[i] + cnz;
  }
  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* send j structure of Co to other processors */
  /*--------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
  ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send, their lengths */
  /*---------------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++){
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len = 0;  /* total length of buf_s[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++){
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for ij-structure */
  /*-------------------------------------------------------------------------*/
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of j-structure */
  /*------------------------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
  sj_waits = si_waits + merge->nsend;

  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
    k++;
  }

  /* determine symbolic Cd = (p->A)^T*AP - stay local */
  /*--------------------------------------------------*/
  /* Allocate cdi array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&cdi);CHKERRQ(ierr);
  cdi[0] = 0;

  /* Set initial free space to be 3*pn*max( nnz(AP) per row) */
  nnz           = 3*pn*ap->abnz_max + 1;
  ierr          = GetMoreSpace(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<pn; i++) {
    cnz = 0;
    pnz = pdti[i+1] - pdti[i];
    j   = pnz;
    ptJ = pdtj + pdti[i+1];
    while (j){/* assume cols are almost in increasing order; starting from the end saves computation */
      j--; ptJ--;
      row  = *ptJ; /* row of AP == col of Pt */
      apnz = api[row+1] - api[row];
      apJ  = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLAdd(apnz,apJ,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnz += nlnk;
    }

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<cnz) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,cnz,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnz;
    current_space->local_used      += cnz;
    current_space->local_remaining -= cnz;
    cdi[i+1] = cdi[i] + cnz;
  }
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);

  /* Allocate space for cdj, initialize cdj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((cdi[pn]+1)*sizeof(PetscInt),&cdj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cdj);CHKERRQ(ierr);

  /* receives and sends of j-structure are complete */
  /*------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);

  /* send and recv i-structure */
  /*---------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (local to the receiving processor)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
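    /* For example (purely illustrative): sending 2 rows with 3 and 2 nonzeros whose local row
       indices on the receiver are 4 and 7 gives len_si[proc] = 2*(2+1) = 6 and
       buf_si = {2, 4, 7, 0, 3, 5}. */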
    nrows = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
      anzi = coi[i+1] - coi[i];
      if (!anzi) SETERRQ1(PETSC_ERR_PLIB,"i=%d, anzi = 0",i);
      buf_si_i[nrows+1] = buf_si_i[nrows] + anzi;  /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }

  ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);

  ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(rj_waits);CHKERRQ(ierr);
  ierr = PetscFree(si_waits);CHKERRQ(ierr);
  ierr = PetscFree(ri_waits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
  ierr = GetMoreSpace((PetscInt)(2*cdi[pn]+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each local row */
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextci  = nextrow + merge->nrecv;
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  len = 0;
  for (i=0; i<pn; i++) {
    bnz = 0;
    /* add local non-zero cols of this proc's C_seq into lnk */
    anzi = cdi[i+1] - cdi[i];
    cji  = cdj + cdi[i];
    ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
    bnz += nlnk;
    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        anzi = *(nextci[k]+1) - *nextci[k];
        cji  = buf_rj[k] + *nextci[k];
        ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        bnz += nlnk;
        nextrow[k]++; nextci[k]++;
      }
    }
    if (len < bnz) len = bnz;  /* len = max(bnz) */

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<bnz) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,bnz,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],bnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += bnz;
    current_space->local_used      += bnz;
    current_space->local_remaining -= bnz;

    bi[i+1] = bi[i] + bnz;
  }
  ierr = PetscFree(cdi);CHKERRQ(ierr);
  ierr = PetscFree(cdj);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix B_mpi */
  /*---------------------------------------*/
  ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
  ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* B_mpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  B_mpi->assembled     = PETSC_FALSE;
  B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
  merge->bi            = bi;
  merge->bj            = bj;
  merge->coi           = coi;
  merge->coj           = coj;
  merge->buf_ri        = buf_ri;
  merge->buf_rj        = buf_rj;
  merge->owners_co     = owners_co;

  /* attach the supporting struct to B_mpi for reuse */
  ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
  ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
  *C = B_mpi;

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MatMatMultMPI    *ap;
  PetscObjectContainer cont_merge,cont_ptap;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *adi=ad->i,*aoi=ao->i,*adj=ad->j,*aoj=ao->j,*apJ,nextp,flops=0;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
  PetscInt             i,j,k,anz,pnz,apnz,nextap,row,*cj;
  MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*pa,*ca,*pa_loc,*pa_oth;
  PetscInt             am=A->m,cm=C->m,pon=(p->B)->n;
  MPI_Comm             comm=C->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cnz,*bj_i,*bi,*bj,bnz,nextcj;  /* bi,bj,ba: local arrays of C (mpi mat) */
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt             *api,*apj,*coi,*coj;
  PetscInt             *poJ=po->j,*pdJ=pd->j,pcstart=p->cstart,pcend=p->cend;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
  if (cont_merge) {
    ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
  }
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
  if (cont_ptap) {
    ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
  }

  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ap->B_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ap->B_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  coi = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);

  bi = merge->bi; bj = merge->bj;
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
  ierr = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get data from symbolic A*P */
  ierr = PetscMalloc((ap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,ap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);

  /* compute numeric C_seq = P_loc^T*A_loc*P */
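  /* For each local row i of A the nonzero values of row i of A*P are accumulated in the work
     array apa, indexed by position within the row's symbolic pattern apJ.  That row is then
     scattered, via the outer product P_loc[i,:]^T * AP[i,:], into ba (values of Cd, rows owned
     by this process) or coa (values of Co, rows owned by other processes), depending on
     whether the global column index of P falls inside [pcstart,pcend). */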
  api = ap->abi; apj = ap->abj;
  for (i=0;i<am;i++) {
    /* form i-th sparse row of A*P */
    apnz = api[i+1] - api[i];
    apJ  = apj + api[i];
    /* diagonal portion of A */
    anz  = adi[i+1] - adi[i];
    for (j=0;j<anz;j++) {
      row = *adj++;
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      nextp = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
          apa[k] += (*ada)*pa[nextp++];
        }
      }
      flops += 2*pnz;
      ada++;
    }
    /* off-diagonal portion of A */
    anz  = aoi[i+1] - aoi[i];
    for (j=0; j<anz; j++) {
      row = *aoj++;
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      nextp = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
          apa[k] += (*aoa)*pa[nextp++];
        }
      }
      flops += 2*pnz;
      aoa++;
    }

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnz = pi_loc[i+1] - pi_loc[i];
    for (j=0; j<pnz; j++) {
      nextap = 0;
      row    = *pJ++; /* global index */
      if (row < pcstart || row >= pcend) { /* put the value into Co */
        cj = coj + coi[*poJ];
        ca = coa + coi[*poJ++];
      } else {                             /* put the value into Cd */
        cj = bj + bi[*pdJ];
        ca = ba + bi[*pdJ++];
      }
      for (k=0; nextap<apnz; k++) {
        if (cj[k]==apJ[nextap]) ca[k] += (*pA)*apa[nextap++];
      }
      flops += 2*apnz;
      pA++;
    }

    /* zero the current row info for A*P */
    ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
  }
  ierr = PetscFree(apa);CHKERRQ(ierr);

  /* send and recv matrix values */
  /*-----------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  ierr = PetscFree(s_waits);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* insert local and received values into C */
  /*-----------------------------------------*/
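  /* Each local row of C is assembled by adding the matching entries of the received rows
     (values in abuf_r, column indices in buf_rj) into the locally computed row ba_i, then
     inserting the completed row with MatSetValues(); the received column patterns are subsets
     of the local pattern bj_i established by the symbolic phase. */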
653   nextrow   = buf_ri_k + merge->nrecv;
654   nextci = nextrow + merge->nrecv;
655 
656   for (k=0; k<merge->nrecv; k++){
657     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
658     nrows       = *(buf_ri_k[k]);
659     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
660     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure  */
661   }
662 
663   for (i=0; i<cm; i++) {
664     row = owners[rank] + i; /* global row index of C_seq */
665     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
666     ba_i = ba + bi[i];
667     bnz  = bi[i+1] - bi[i];
668     /* add received vals into ba */
669     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
670       /* i-th row */
671       if (i == *nextrow[k]) {
672         cnz = *(nextci[k]+1) - *nextci[k];
673         cj  = buf_rj[k] + *(nextci[k]);
674         ca  = abuf_r[k] + *(nextci[k]);
675         nextcj = 0;
676         for (j=0; nextcj<cnz; j++){
677           if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
678             ba_i[j] += ca[nextcj++];
679           }
680         }
681         nextrow[k]++; nextci[k]++;
682       }
683     }
684     ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
685     flops += 2*cnz;
686   }
687   ierr = MatSetOption(C,MAT_COLUMNS_SORTED);CHKERRQ(ierr); /* -cause delay? */
688   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
689   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
690 
691   ierr = PetscFree(ba);CHKERRQ(ierr);
692   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
693   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
694   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
695 
696   PetscFunctionReturn(0);
697 }
698