xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 5ef145eb8df80296d879d7eba3b3c32bcd261bc3)
1 /*
2   Defines projective product routines where A is a MPIAIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
12 #undef __FUNCT__
13 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
14 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
15 {
16   PetscErrorCode       ierr;
17   Mat_MatMatMultMPI    *ptap;
18   PetscObjectContainer container;
19 
20   PetscFunctionBegin;
21   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
22   if (container) {
23     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
24     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
25     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
26     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
27     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
28     if (ptap->abi) ierr = PetscFree(ptap->abi);CHKERRQ(ierr);
29     if (ptap->abj) ierr = PetscFree(ptap->abj);CHKERRQ(ierr);
30 
31     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
32     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
33   }
34   ierr = PetscFree(ptap);CHKERRQ(ierr);
35 
36   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
37   PetscFunctionReturn(0);
38 }
39 
40 #undef __FUNCT__
41 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
42 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
43 {
44   PetscErrorCode       ierr;
45   Mat_MatMatMultMPI    *ptap;
46   PetscObjectContainer container;
47 
48   PetscFunctionBegin;
49   if (scall == MAT_INITIAL_MATRIX){
50     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
51     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
52     ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL;
53     ptap->abnz_max = 0; /* symbolic A*P is not done yet */
54 
55     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
56     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
57 
58     /* get P_loc by taking all local rows of P */
59     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
60 
61     /* attach the supporting struct to P for reuse */
62     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
63     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
64     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
65     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
66 
67     /* now, compute symbolic local P^T*A*P */
68     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
69     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
70   } else if (scall == MAT_REUSE_MATRIX){
71     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
72     if (container) {
73       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
74     } else {
75       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container");
76     }
77 
78     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
79     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
80 
81     /* get P_loc by taking all local rows of P */
82     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
83 
84   } else {
85     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
86   }
87   /* now, compute numeric local P^T*A*P */
88   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
89   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
90   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
91 
92   PetscFunctionReturn(0);
93 }
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
97 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
98 {
99   PetscErrorCode       ierr;
100   Mat                  P_loc,P_oth,B_mpi;
101   Mat_MatMatMultMPI    *ap;
102   PetscObjectContainer container;
103   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
104   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
105   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
106   Mat_SeqAIJ           *p_loc,*p_oth;
107   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ,*pjj;
108   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz;
109   PetscInt             nlnk,*lnk,*cdi,*cdj,*owners_co,*coi,*coj,*cji;
110   PetscInt             am=A->m,pN=P->N,pm=P->m,pn=P->n;
111   PetscInt             i,j,k,ptnzi,arow,prow,pnzj,cnzi;
112   PetscBT              lnkbt;
113   PetscInt             prstart,prend;
114   MPI_Comm             comm=A->comm;
115   PetscMPIInt          size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
116   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
117   PetscInt             len,proc,*dnz,*onz,*owners;
118   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
119   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
120   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
121   MPI_Status           *status;
122   Mat_Merge_SeqsToMPI  *merge;
123   PetscInt             *apsymi,*apsymj,*apj,apnzj,*prmap=p->garray,pon;
124 
125   PetscFunctionBegin;
126   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
127   if (container) {
128     ierr  = PetscObjectContainerGetPointer(container,(void **)&ap);CHKERRQ(ierr);
129   } else {
130     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container");
131   }
132   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
133   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
134 
135   /* get data from P_loc and P_oth */
136   P_loc=ap->B_loc; P_oth=ap->B_oth; prstart=ap->brstart;
137   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
138   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
139   prend = prstart + pm;
140 
141   /* first, compute symbolic AP = A_loc*P */
142   /*--------------------------------------*/
143   ierr  = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr);
144   ap->abi = apsymi;
145   apsymi[0] = 0;
146 
147   /* create and initialize a linked list */
148   nlnk = pN+1;
149   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
150 
151   /* Initial FreeSpace size is 2*nnz(A) */
152   ierr = GetMoreSpace((PetscInt)(fill*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
153   current_space = free_space;
154 
155   for (i=0;i<am;i++) {
156     apnzj = 0;
157     /* diagonal portion of A */
158     anzi = adi[i+1] - adi[i];
159     for (j=0; j<anzi; j++){
160       prow = *adj++;
161       pnzj = pi_loc[prow+1] - pi_loc[prow];
162       pjj  = pj_loc + pi_loc[prow];
163       /* add non-zero cols of P into the sorted linked list lnk */
164       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
165       apnzj += nlnk;
166     }
167     /* off-diagonal portion of A */
168     anzi = aoi[i+1] - aoi[i];
169     for (j=0; j<anzi; j++){
170       prow = *aoj++;
171       pnzj = pi_oth[prow+1] - pi_oth[prow];
172       pjj  = pj_oth + pi_oth[prow];
173       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
174       apnzj += nlnk;
175     }
176 
177     apsymi[i+1] = apsymi[i] + apnzj;
178     if (ap->abnz_max < apnzj) ap->abnz_max = apnzj;
179 
180     /* if free space is not available, double the total space in the list */
181     if (current_space->local_remaining<apnzj) {
182       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
183     }
184 
185     /* Copy data into free space, then initialize lnk */
186     ierr = PetscLLClean(pN,pN,apnzj,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
187     current_space->array           += apnzj;
188     current_space->local_used      += apnzj;
189     current_space->local_remaining -= apnzj;
190   }
191   /* Allocate space for apsymj, initialize apsymj, and */
192   /* destroy list of free space and other temporary array(s) */
193   ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ap->abj);CHKERRQ(ierr);
194   apsymj = ap->abj;
195   ierr = MakeSpaceContiguous(&free_space,ap->abj);CHKERRQ(ierr);
196 
197   /* get ij structure of (p->A)^T and (p->B)^T */
198   /*-------------------------------------------*/
199   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
200   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
201 
202   /* then, compute symbolic Co_seq = (p->B)^T*AP */
203   /* Allocate coi array, arrays for fill computation and */
204   /* free space for accumulating nonzero column info */
205   pon = (p->B)->n; /* total num of rows to be sent to other processors
206                          >= (num of nonzero rows of C_seq) - pn */
207   ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
208   coi[0] = 0;
209 
210   /* set initial free space to be 2*pon*max( nnz(AP) per row)- an upper bound */
211   nnz           = 2*pon*ap->abnz_max + 1;
212   ierr          = GetMoreSpace(nnz,&free_space);
213   current_space = free_space;
214 
215   /* determine symbolic Co=(p->B)^T*AP - send to others */
216   for (i=0; i<pon; i++) {
217     cnzi  = 0;
218     ptnzi = poti[i+1] - poti[i];
219     j     = ptnzi;
220     ptJ   = potj + poti[i+1];
221     while (j){/* assume cols are almost in increasing order, starting from its end saves computation */
222       j--; ptJ--;
223       arow  = *ptJ; /* row of AP == col of Pot */
224       apnzj = apsymi[arow+1] - apsymi[arow];
225       apj   = apsymj + apsymi[arow];
226       /* add non-zero cols of AP into the sorted linked list lnk */
227       ierr = PetscLLAdd(apnzj,apj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
228       cnzi += nlnk;
229     }
230 
231     /* If free space is not available, double the total space in the list */
232     if (current_space->local_remaining<cnzi) {
233       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
234     }
235 
236     /* Copy data into free space, and zero out denserows */
237     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
238     current_space->array           += cnzi;
239     current_space->local_used      += cnzi;
240     current_space->local_remaining -= cnzi;
241     coi[i+1] = coi[i] + cnzi;
242   }
243   ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
244   ierr = MakeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
245 
246   /* determine symbolic Cd = (p->A)^T*AP - stay local */
247   /*------------------------------------------------------*/
248   /* Allocate ci array, arrays for fill computation and */
249   /* free space for accumulating nonzero column info */
250   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&cdi);CHKERRQ(ierr);
251   cdi[0] = 0;
252 
253   /* Set initial free space to be 2*pn*max( nnz(AP) per row) */
254   nnz           = 2*pn*ap->abnz_max + 1;
255   ierr          = GetMoreSpace(nnz,&free_space);
256   current_space = free_space;
257 
258   for (i=0; i<pn; i++) {
259     cnzi  = 0;
260     ptnzi = pdti[i+1] - pdti[i];
261     if (!ptnzi) SETERRQ1(1,"%d ptnzi=0",i);
262     j     = ptnzi;
263     ptJ   = pdtj + pdti[i+1];
264     while (j){/* assume cols are almost in increasing order, starting from its end saves computation */
265       j--; ptJ--;
266       arow  = *ptJ; /* row of AP == col of Pt */
267       apnzj = apsymi[arow+1] - apsymi[arow];
268       apj   = apsymj + apsymi[arow];
269       /* add non-zero cols of AP into the sorted linked list lnk */
270       ierr = PetscLLAdd(apnzj,apj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
271       cnzi += nlnk;
272     }
273 
274     /* If free space is not available, double the total space in the list */
275     if (current_space->local_remaining<cnzi) {
276       printf("[%d] free space is not available, double\n",rank);
277       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
278     }
279 
280     /* Copy data into free space, and zero out denserows */
281     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
282     current_space->array           += cnzi;
283     current_space->local_used      += cnzi;
284     current_space->local_remaining -= cnzi;
285     cdi[i+1] = cdi[i] + cnzi;
286   }
287 
288   /* Clean up. */
289   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
290   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);
291 
292   /* Allocate space for cj, initialize cj, and */
293   /* destroy list of free space and other temporary array(s) */
294   ierr = PetscMalloc((cdi[pn]+1)*sizeof(PetscInt),&cdj);CHKERRQ(ierr);
295   ierr = MakeSpaceContiguous(&free_space,cdj);CHKERRQ(ierr);
296   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
297 
298   /* add C_seq into mpi C              */
299   /*-----------------------------------*/
300   /* determine row ownership */
301   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
302   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
303   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
304   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
305   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
306 
307   /* determine the number of messages to send, their lengths */
308   /*---------------------------------------------------------*/
309   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
310   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
311   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
312   len_s = merge->len_s;
313   merge->nsend = 0;
314 
315   ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
316   ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
317 
318   proc = 0;
319   for (i=0; i<pon; i++){
320     while (prmap[i] >= owners[proc+1]) proc++;
321     len_si[proc]++;  /* num of rows in Co to be sent to [proc] */
322     len_s[proc] += coi[i+1] - coi[i];
323   }
324 
325   len   = 0;  /* max length of buf_si[] */
326   owners_co[0] = 0;
327   for (proc=0; proc<size; proc++){
328     owners_co[proc+1] = owners_co[proc] + len_si[proc];
329     if (len_si[proc]){
330       merge->nsend++;
331       len_si[proc] = 2*(len_si[proc] + 1);
332       len += len_si[proc];
333     }
334   }
335 
336   /* determine the number and length of messages to receive for ij-structure */
337   /*-------------------------------------------------------------------------*/
338   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
339   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
340 
341   /* post the Irecv of j-structure */
342   /*-------------------------------*/
343   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
344   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
345 
346   /* post the Isend of j-structure */
347   /*--------------------------------*/
348   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
349   sj_waits = si_waits + merge->nsend;
350 
351   for (proc=0, k=0; proc<size; proc++){
352     if (!len_s[proc]) continue;
353     i = owners_co[proc];
354     ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
355     k++;
356   }
357 
358   /* receives and sends of j-structure are complete */
359   /*------------------------------------------------*/
360   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
361   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
362   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
363 
364   /* send and recv i-structure */
365   /*---------------------------*/
366   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
367   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
368 
369   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
370   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
371   for (proc=0,k=0; proc<size; proc++){
372     if (!len_s[proc]) continue;
373     /* form outgoing message for i-structure:
374          buf_si[0]:                 nrows to be sent
375                [1:nrows]:           row index (global)
376                [nrows+1:2*nrows+1]: i-structure index
377     */
378     /*-------------------------------------------*/
379     nrows = len_si[proc]/2 - 1;
380     buf_si_i    = buf_si + nrows+1;
381     buf_si[0]   = nrows;
382     buf_si_i[0] = 0;
383     nrows = 0;
384     for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
385       anzi = coi[i+1] - coi[i];
386       if (!anzi) SETERRQ1(1,"i=%d, ani = 0",i);
387       buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
388       buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */
389       nrows++;
390     }
391     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
392     k++;
393     buf_si += len_si[proc];
394   }
395 
396   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
397   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
398 
399   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
400   for (i=0; i<merge->nrecv; i++){
401     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
402   }
403 
404   ierr = PetscFree(len_si);CHKERRQ(ierr);
405   ierr = PetscFree(len_ri);CHKERRQ(ierr);
406   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
407   ierr = PetscFree(si_waits);CHKERRQ(ierr);
408   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
409   ierr = PetscFree(buf_s);CHKERRQ(ierr);
410   ierr = PetscFree(status);CHKERRQ(ierr);
411 
412   /* compute the local portion of C (mpi mat) */
413   /*------------------------------------------*/
414   /* allocate bi array and free space for accumulating nonzero column info */
415   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
416   bi[0] = 0;
417 
418   /* create and initialize a linked list */
419   nlnk = pN+1;
420   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
421 
422   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
423   free_space=PETSC_NULL; current_space=PETSC_NULL;
424   ierr = GetMoreSpace((PetscInt)(2*cdi[pn]+1),&free_space);CHKERRQ(ierr);
425   current_space = free_space;
426 
427   /* determine symbolic info for each local row */
428   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
429   nextrow = buf_ri_k + merge->nrecv;
430   nextci  = nextrow + merge->nrecv;
431   for (k=0; k<merge->nrecv; k++){
432     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
433     nrows = *buf_ri_k[k];
434     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
435     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure  */
436   }
437 
438   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
439   len = 0;
440   for (i=0; i<pn; i++) {
441     bnzi = 0;
442     /* add local non-zero cols of this proc's C_seq into lnk */
443     anzi = cdi[i+1] - cdi[i];
444     cji  = cdj + cdi[i];
445     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
446     bnzi += nlnk;
447     /* add received col data into lnk */
448     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
449       if (i == *nextrow[k]) { /* i-th row */
450         anzi = *(nextci[k]+1) - *nextci[k];
451         cji  = buf_rj[k] + *nextci[k];
452         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
453         bnzi += nlnk;
454         nextrow[k]++; nextci[k]++;
455       }
456     }
457     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
458 
459     /* if free space is not available, make more free space */
460     if (current_space->local_remaining<bnzi) {
461       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
462       nspacedouble++;
463     }
464     /* copy data into free space, then initialize lnk */
465     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
466     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
467 
468     current_space->array           += bnzi;
469     current_space->local_used      += bnzi;
470     current_space->local_remaining -= bnzi;
471 
472     bi[i+1] = bi[i] + bnzi;
473   }
474   ierr = PetscFree(cdi);CHKERRQ(ierr);
475   ierr = PetscFree(cdj);CHKERRQ(ierr);
476   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
477 
478   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
479   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
480   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
481 
482   /* create symbolic parallel matrix B_mpi */
483   /*---------------------------------------*/
484   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
485   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
486    ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
487   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
488   /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -cause delay? */
489 
490   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
491   B_mpi->assembled     = PETSC_FALSE;
492   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
493   merge->bi            = bi;
494   merge->bj            = bj;
495   merge->coi           = coi;
496   merge->coj           = coj;
497   merge->buf_ri        = buf_ri;
498   merge->buf_rj        = buf_rj;
499   merge->owners_co     = owners_co;
500 
501   /* attach the supporting struct to B_mpi for reuse */
502   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
503   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
504   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
505   *C = B_mpi;
506 
507   PetscFunctionReturn(0);
508 }
509 
510 #undef __FUNCT__
511 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
512 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
513 {
514   PetscErrorCode       ierr;
515   Mat_Merge_SeqsToMPI  *merge;
516   Mat_MatMatMultMPI    *ap;
517   PetscObjectContainer cont_merge,cont_ptap;
518   PetscInt       flops=0;
519   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
520   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
521   Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
522   Mat_SeqAIJ     *p_loc,*p_oth;
523   PetscInt       *adi=ad->i,*aoi=ao->i,*adj=ad->j,*aoj=ao->j,*apj,nextp;
524   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
525   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
526   PetscInt       *cjj;
527   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*caj;
528   MatScalar      *pa_loc,*pa_oth;
529   PetscInt       am=A->m,cm=C->m;
530   MPI_Comm       comm=C->comm;
531   PetscMPIInt    size,rank,taga,*len_s;
532   PetscInt       *owners,proc,nrows,**buf_ri_k,**nextrow,**nextcseqi;
533   PetscInt       **buf_ri,**buf_rj;
534   PetscInt       cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi,bj,ba: local array of C(mpi mat) */
535   MPI_Request    *s_waits,*r_waits;
536   MPI_Status     *status;
537   MatScalar      **abuf_r,*ba_i,*cseqa_tmp;
538   PetscInt       *cseqj_tmp;
539   PetscInt       *apsymi,*apsymj;
540   PetscInt       pon=(p->B)->n,*coi,*coj;
541   PetscInt       *poi=po->i,*poj=po->j,*poJ=poj,*pdi=pd->i,*pdj=pd->j,*pdJ=pdj;
542   MatScalar      *coa,*pdA=pd->a,*poA=po->a,*ba;
543 
544   PetscFunctionBegin;
545   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
546   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
547 
548   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
549   if (cont_merge) {
550     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
551   } else {
552     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not posses an object container");
553   }
554   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
555   if (cont_ptap) {
556     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ap);CHKERRQ(ierr);
557   } else {
558     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container");
559   }
560 
561   /* get data from symbolic products */
562   p_loc=(Mat_SeqAIJ*)(ap->B_loc)->data;
563   p_oth=(Mat_SeqAIJ*)(ap->B_oth)->data;
564   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a;
565   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
566 
567   coi   = merge->coi; coj   = merge->coj;
568   ierr  = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
569   ierr  = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);
570 
571   bi = merge->bi; bj = merge->bj;
572   ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
573   ierr  = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
574   ierr  = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);
575 
576   /* get data from symbolic A*P */
577   ierr = PetscMalloc((ap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr);
578   ierr = PetscMemzero(apa,ap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);
579 
580   /* compute numeric C_seq=P_loc^T*A_loc*P */
581   apsymi = ap->abi; apsymj = ap->abj;
582   for (i=0;i<am;i++) {
583     /* form i-th sparse row of A*P */
584     apnzj = apsymi[i+1] - apsymi[i];
585     apj   = apsymj + apsymi[i];
586     /* diagonal portion of A */
587     anzi  = adi[i+1] - adi[i];
588     for (j=0;j<anzi;j++) {
589       prow = *adj++;
590       pnzj = pi_loc[prow+1] - pi_loc[prow];
591       pjj  = pj_loc + pi_loc[prow];
592       paj  = pa_loc + pi_loc[prow];
593       nextp = 0;
594       for (k=0; nextp<pnzj; k++) {
595         if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
596           apa[k] += (*ada)*paj[nextp++];
597         }
598       }
599       flops += 2*pnzj;
600       ada++;
601     }
602     /* off-diagonal portion of A */
603     anzi  = aoi[i+1] - aoi[i];
604     for (j=0;j<anzi;j++) {
605       prow = *aoj++;
606       pnzj = pi_oth[prow+1] - pi_oth[prow];
607       pjj  = pj_oth + pi_oth[prow];
608       paj  = pa_oth + pi_oth[prow];
609       nextp = 0;
610       for (k=0; nextp<pnzj; k++) {
611         if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
612           apa[k] += (*aoa)*paj[nextp++];
613         }
614       }
615       flops += 2*pnzj;
616       aoa++;
617     }
618 
619     /* compute outer product (p->A)[i,:]^T*AP[i,:] - stay local */
620     pnzi = pdi[i+1] - pdi[i];
621     for (j=0;j<pnzi;j++) {
622       nextap = 0;
623       crow   = *pdJ++;
624       cjj    = bj + bi[crow];
625       caj    = ba + bi[crow];
626       for (k=0; nextap<apnzj; k++) {
627         if (cjj[k]==apj[nextap]) caj[k] += (*pdA)*apa[nextap++];
628       }
629       flops += 2*apnzj;
630       pdA++;
631     }
632 
633     /* compute outer product (p->B)[i,:]^T*AP[i,:] - send to others */
634     pnzi = poi[i+1] - poi[i];
635     for (j=0;j<pnzi;j++) {
636       nextap = 0;
637       crow   = *poJ++;
638       cjj    = coj + coi[crow];
639       caj    = coa + coi[crow];
640       for (k=0; nextap<apnzj; k++) {
641         if (cjj[k] == apj[nextap]) caj[k] += (*poA)*apa[nextap++];
642       }
643       flops += 2*apnzj;
644       poA++;
645     }
646 
647     /* zero the current row info for A*P */
648     ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr);
649   }
650   ierr = PetscFree(apa);CHKERRQ(ierr);
651 
652   /* send and recv matrix values */
653   /*-----------------------------*/
654 
655   buf_ri = merge->buf_ri;
656   buf_rj = merge->buf_rj;
657 
658   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
659 
660   len_s  = merge->len_s;
661   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
662   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
663 
664   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
665   for (proc=0,k=0; proc<size; proc++){
666     if (!len_s[proc]) continue;
667     i = merge->owners_co[proc];
668     ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
669     k++;
670   }
671 
672   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
673   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
674   ierr = PetscFree(status);CHKERRQ(ierr);
675 
676   ierr = PetscFree(s_waits);CHKERRQ(ierr);
677   ierr = PetscFree(r_waits);CHKERRQ(ierr);
678 
679   /* insert mat values of mpimat */
680   /*----------------------------*/
681   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
682   nextrow = buf_ri_k + merge->nrecv;
683   nextcseqi  = nextrow + merge->nrecv;
684 
685   for (k=0; k<merge->nrecv; k++){
686     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
687     nrows = *(buf_ri_k[k]);
688     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
689     nextcseqi[k]   = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure  */
690   }
691 
692   /* insert local and received values into C */
693   for (i=0; i<cm; i++) {
694     cseqrow = owners[rank] + i; /* global row index of C_seq */
695     bj_i = bj + bi[i];  /* col indices of the i-th row of C */
696     ba_i = ba + bi[i];
697     bnzi = bi[i+1] - bi[i];
698     /* add received vals into ba */
699     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
700       /* i-th row */
701       if (i == *nextrow[k]) {
702         cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
703         cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
704         cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
705         nextcseqj = 0;
706         for (j=0; nextcseqj<cseqnzi; j++){
707           if (bj_i[j] == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
708             ba_i[j] += cseqa_tmp[nextcseqj++];
709           }
710         }
711         nextrow[k]++; nextcseqi[k]++;
712       }
713     }
714     ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
715     flops += 2*cseqnzi;
716   }
717   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
718   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
719 
720   ierr = PetscFree(coa);CHKERRQ(ierr);
721   ierr = PetscFree(ba);CHKERRQ(ierr);
722   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
723   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
724   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
725 
726   PetscFunctionReturn(0);
727 }
728