1 2 /* 3 Defines projective product routines where A is a MPIAIJ matrix 4 C = P^T * A * P 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <../src/mat/impls/aij/mpi/mpiaij.h> 10 #include <petscbt.h> 11 #include <petsctime.h> 12 13 /* #define PTAP_PROFILE */ 14 15 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A) 16 { 17 PetscErrorCode ierr; 18 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 19 Mat_PtAPMPI *ptap=a->ptap; 20 21 PetscFunctionBegin; 22 if (ptap) { 23 Mat_Merge_SeqsToMPI *merge=ptap->merge; 24 ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr); 25 ierr = PetscFree(ptap->bufa);CHKERRQ(ierr); 26 ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr); 27 ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr); 28 ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */ 29 ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr); 30 ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr); 31 if (ptap->AP_loc) { /* used by alg_rap */ 32 Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data; 33 ierr = PetscFree(ap->i);CHKERRQ(ierr); 34 ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr); 35 ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr); 36 } else { /* used by alg_ptap */ 37 ierr = PetscFree(ptap->api);CHKERRQ(ierr); 38 ierr = PetscFree(ptap->apj);CHKERRQ(ierr); 39 } 40 ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr); 41 ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr); 42 if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);} 43 44 if (merge) { /* used by alg_ptap */ 45 ierr = PetscFree(merge->id_r);CHKERRQ(ierr); 46 ierr = PetscFree(merge->len_s);CHKERRQ(ierr); 47 ierr = PetscFree(merge->len_r);CHKERRQ(ierr); 48 ierr = PetscFree(merge->bi);CHKERRQ(ierr); 49 ierr = PetscFree(merge->bj);CHKERRQ(ierr); 50 ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr); 51 ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr); 52 ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr); 53 ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr); 54 ierr = 
PetscFree(merge->coi);CHKERRQ(ierr); 55 ierr = PetscFree(merge->coj);CHKERRQ(ierr); 56 ierr = PetscFree(merge->owners_co);CHKERRQ(ierr); 57 ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr); 58 ierr = PetscFree(ptap->merge);CHKERRQ(ierr); 59 } 60 61 ierr = ptap->destroy(A);CHKERRQ(ierr); 62 ierr = PetscFree(ptap);CHKERRQ(ierr); 63 } 64 PetscFunctionReturn(0); 65 } 66 67 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M) 68 { 69 PetscErrorCode ierr; 70 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 71 Mat_PtAPMPI *ptap = a->ptap; 72 73 PetscFunctionBegin; 74 ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr); 75 (*M)->ops->destroy = ptap->destroy; 76 (*M)->ops->duplicate = ptap->duplicate; 77 PetscFunctionReturn(0); 78 } 79 80 PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C) 81 { 82 PetscErrorCode ierr; 83 PetscBool rap=PETSC_TRUE; /* do R=P^T locally, then C=R*A*P */ 84 MPI_Comm comm; 85 86 PetscFunctionBegin; 87 /* check if matrix local sizes are compatible */ 88 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 89 if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend); 90 if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend); 91 92 ierr = PetscOptionsGetBool(NULL,NULL,"-matrap",&rap,NULL);CHKERRQ(ierr); 93 if (scall == MAT_INITIAL_MATRIX) { 94 ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 95 if (rap) { /* do R=P^T locally, then C=R*A*P */ 96 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); 97 } else { /* do P^T*A*P */ 98 ierr = 
MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(A,P,fill,C);CHKERRQ(ierr); 99 } 100 ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 101 } 102 ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 103 ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr); 104 ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 105 PetscFunctionReturn(0); 106 } 107 108 #if defined(PETSC_HAVE_HYPRE) 109 PETSC_INTERN PetscErrorCode MatPtAPSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*); 110 #endif 111 112 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C) 113 { 114 PetscErrorCode ierr; 115 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 116 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 117 Mat_SeqAIJ *ap,*p_loc,*p_oth,*c_seq; 118 Mat_PtAPMPI *ptap = c->ptap; 119 Mat AP_loc,C_loc,C_oth; 120 PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz; 121 PetscScalar *apa; 122 const PetscInt *cols; 123 const PetscScalar *vals; 124 125 PetscFunctionBegin; 126 ierr = MatZeroEntries(C);CHKERRQ(ierr); 127 128 /* 1) get R = Pd^T,Ro = Po^T */ 129 if (ptap->reuse == MAT_REUSE_MATRIX) { 130 ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr); 131 ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr); 132 } 133 134 /* 2) get AP_loc */ 135 AP_loc = ptap->AP_loc; 136 ap = (Mat_SeqAIJ*)AP_loc->data; 137 138 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 139 /*-----------------------------------------------------*/ 140 if (ptap->reuse == MAT_REUSE_MATRIX) { 141 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 142 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 143 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 144 } 145 146 /* 2-2) compute numeric A_loc*P - dominating part */ 147 /* 
---------------------------------------------- */ 148 /* get data from symbolic products */ 149 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 150 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 151 api = ap->i; 152 apj = ap->j; 153 for (i=0; i<am; i++) { 154 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 155 apnz = api[i+1] - api[i]; 156 apa = ap->a + api[i]; 157 ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr); 158 AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa); 159 ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr); 160 } 161 162 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 163 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr); 164 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr); 165 166 C_loc = ptap->C_loc; 167 C_oth = ptap->C_oth; 168 169 /* add C_loc and Co to to C */ 170 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 171 172 /* C_loc -> C */ 173 cm = C_loc->rmap->N; 174 c_seq = (Mat_SeqAIJ*)C_loc->data; 175 cols = c_seq->j; 176 vals = c_seq->a; 177 for (i=0; i<cm; i++) { 178 ncols = c_seq->i[i+1] - c_seq->i[i]; 179 row = rstart + i; 180 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 181 cols += ncols; vals += ncols; 182 } 183 184 /* Co -> C, off-processor part */ 185 cm = C_oth->rmap->N; 186 c_seq = (Mat_SeqAIJ*)C_oth->data; 187 cols = c_seq->j; 188 vals = c_seq->a; 189 for (i=0; i<cm; i++) { 190 ncols = c_seq->i[i+1] - c_seq->i[i]; 191 row = p->garray[i]; 192 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 193 cols += ncols; vals += ncols; 194 } 195 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 196 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 197 198 ptap->reuse = MAT_REUSE_MATRIX; 199 PetscFunctionReturn(0); 200 } 201 202 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C) 203 { 204 PetscErrorCode ierr; 205 Mat_PtAPMPI *ptap; 206 Mat_MPIAIJ 
*a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 207 MPI_Comm comm; 208 PetscMPIInt size,rank; 209 Mat Cmpi,P_loc,P_oth; 210 PetscFreeSpaceList free_space=NULL,current_space=NULL; 211 PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 212 PetscInt *lnk,i,k,pnz,row,nsend; 213 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv; 214 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 215 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 216 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 217 MPI_Request *swaits,*rwaits; 218 MPI_Status *sstatus,rstatus; 219 PetscLayout rowmap; 220 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 221 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 222 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi; 223 Mat_SeqAIJ *p_loc,*p_oth,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c_loc,*c_oth; 224 PetscScalar *apv; 225 PetscTable ta; 226 #if defined(PETSC_USE_INFO) 227 PetscReal apfill; 228 #endif 229 230 PetscFunctionBegin; 231 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 232 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 233 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 234 235 /* create symbolic parallel matrix Cmpi */ 236 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 237 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 238 239 /* create struct Mat_PtAPMPI and attached it to C later */ 240 ierr = PetscNew(&ptap);CHKERRQ(ierr); 241 ptap->reuse = MAT_INITIAL_MATRIX; 242 243 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 244 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr); 245 /* get P_loc by taking all local rows of P */ 246 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr); 247 248 ptap->P_loc = P_loc; 249 ptap->P_oth = P_oth; 
250 251 /* (0) compute Rd = Pd^T, Ro = Po^T */ 252 /* --------------------------------- */ 253 ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 254 ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 255 256 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 257 /* ----------------------------------------------------------------- */ 258 p_loc = (Mat_SeqAIJ*)P_loc->data; 259 p_oth = (Mat_SeqAIJ*)P_oth->data; 260 261 /* create and initialize a linked list */ 262 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 263 MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta); 264 MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta); 265 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 266 267 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 268 269 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 270 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 271 current_space = free_space; 272 nspacedouble = 0; 273 274 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 275 api[0] = 0; 276 for (i=0; i<am; i++) { 277 /* diagonal portion: Ad[i,:]*P */ 278 ai = ad->i; pi = p_loc->i; 279 nzi = ai[i+1] - ai[i]; 280 aj = ad->j + ai[i]; 281 for (j=0; j<nzi; j++) { 282 row = aj[j]; 283 pnz = pi[row+1] - pi[row]; 284 Jptr = p_loc->j + pi[row]; 285 /* add non-zero cols of P into the sorted linked list lnk */ 286 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 287 } 288 /* off-diagonal portion: Ao[i,:]*P */ 289 ai = ao->i; pi = p_oth->i; 290 nzi = ai[i+1] - ai[i]; 291 aj = ao->j + ai[i]; 292 for (j=0; j<nzi; j++) { 293 row = aj[j]; 294 pnz = pi[row+1] - pi[row]; 295 Jptr = p_oth->j + pi[row]; 296 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 297 } 298 apnz = lnk[0]; 299 api[i+1] = api[i] + apnz; 
300 301 /* if free space is not available, double the total space in the list */ 302 if (current_space->local_remaining<apnz) { 303 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 304 nspacedouble++; 305 } 306 307 /* Copy data into free space, then initialize lnk */ 308 ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr); 309 310 current_space->array += apnz; 311 current_space->local_used += apnz; 312 current_space->local_remaining -= apnz; 313 } 314 /* Allocate space for apj and apv, initialize apj, and */ 315 /* destroy list of free space and other temporary array(s) */ 316 ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 317 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 318 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 319 320 /* Create AP_loc for reuse */ 321 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 322 323 #if defined(PETSC_USE_INFO) 324 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 325 ptap->AP_loc->info.mallocs = nspacedouble; 326 ptap->AP_loc->info.fill_ratio_given = fill; 327 ptap->AP_loc->info.fill_ratio_needed = apfill; 328 329 if (api[am]) { 330 ierr = PetscInfo3(ptap->AP_loc,"AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 331 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 332 } else { 333 ierr = PetscInfo(ptap->AP_loc,"AP_loc is empty \n");CHKERRQ(ierr); 334 } 335 #endif 336 337 /* (2-1) compute symbolic Co = Ro*AP_loc */ 338 /* ------------------------------------ */ 339 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr); 340 341 /* (3) send coj of C_oth to other processors */ 342 /* ------------------------------------------ */ 343 /* determine row 
ownership */ 344 ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 345 rowmap->n = pn; 346 rowmap->bs = 1; 347 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 348 owners = rowmap->range; 349 350 /* determine the number of messages to send, their lengths */ 351 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 352 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 353 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 354 355 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 356 coi = c_oth->i; coj = c_oth->j; 357 con = ptap->C_oth->rmap->n; 358 proc = 0; 359 for (i=0; i<con; i++) { 360 while (prmap[i] >= owners[proc+1]) proc++; 361 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 362 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 363 } 364 365 len = 0; /* max length of buf_si[], see (4) */ 366 owners_co[0] = 0; 367 nsend = 0; 368 for (proc=0; proc<size; proc++) { 369 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 370 if (len_s[proc]) { 371 nsend++; 372 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 373 len += len_si[proc]; 374 } 375 } 376 377 /* determine the number and length of messages to receive for coi and coj */ 378 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 379 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 380 381 /* post the Irecv and Isend of coj */ 382 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 383 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 384 ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 385 for (proc=0, k=0; proc<size; proc++) { 386 if (!len_s[proc]) continue; 387 i = owners_co[proc]; 388 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 389 k++; 390 } 391 392 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 393 /* 
---------------------------------------- */ 394 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr); 395 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 396 397 /* receives coj are complete */ 398 for (i=0; i<nrecv; i++) { 399 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 400 } 401 ierr = PetscFree(rwaits);CHKERRQ(ierr); 402 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 403 404 /* add received column indices into ta to update Crmax */ 405 for (k=0; k<nrecv; k++) {/* k-th received message */ 406 Jptr = buf_rj[k]; 407 for (j=0; j<len_r[k]; j++) { 408 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 409 } 410 } 411 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 412 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 413 414 /* (4) send and recv coi */ 415 /*-----------------------*/ 416 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 417 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 418 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 419 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 420 for (proc=0,k=0; proc<size; proc++) { 421 if (!len_s[proc]) continue; 422 /* form outgoing message for i-structure: 423 buf_si[0]: nrows to be sent 424 [1:nrows]: row index (global) 425 [nrows+1:2*nrows+1]: i-structure index 426 */ 427 /*-------------------------------------------*/ 428 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 429 buf_si_i = buf_si + nrows+1; 430 buf_si[0] = nrows; 431 buf_si_i[0] = 0; 432 nrows = 0; 433 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 434 nzi = coi[i+1] - coi[i]; 435 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 436 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 437 nrows++; 438 } 439 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 440 k++; 441 buf_si += len_si[proc]; 442 } 443 for 
(i=0; i<nrecv; i++) { 444 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 445 } 446 ierr = PetscFree(rwaits);CHKERRQ(ierr); 447 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 448 449 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 450 ierr = PetscFree(len_ri);CHKERRQ(ierr); 451 ierr = PetscFree(swaits);CHKERRQ(ierr); 452 ierr = PetscFree(buf_s);CHKERRQ(ierr); 453 454 /* (5) compute the local portion of Cmpi */ 455 /* ------------------------------------------ */ 456 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 457 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 458 current_space = free_space; 459 460 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 461 for (k=0; k<nrecv; k++) { 462 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 463 nrows = *buf_ri_k[k]; 464 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 465 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 466 } 467 468 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 469 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 470 for (i=0; i<pn; i++) { 471 /* add C_loc into Cmpi */ 472 nzi = c_loc->i[i+1] - c_loc->i[i]; 473 Jptr = c_loc->j + c_loc->i[i]; 474 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 475 476 /* add received col data into lnk */ 477 for (k=0; k<nrecv; k++) { /* k-th received message */ 478 if (i == *nextrow[k]) { /* i-th row */ 479 nzi = *(nextci[k]+1) - *nextci[k]; 480 Jptr = buf_rj[k] + *nextci[k]; 481 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 482 nextrow[k]++; nextci[k]++; 483 } 484 } 485 nzi = lnk[0]; 486 487 /* copy data into free space, then initialize lnk */ 488 ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr); 489 ierr = 
MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 490 } 491 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 492 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 493 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 494 495 /* local sizes and preallocation */ 496 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 497 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr); 498 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 499 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 500 501 /* members in merge */ 502 ierr = PetscFree(id_r);CHKERRQ(ierr); 503 ierr = PetscFree(len_r);CHKERRQ(ierr); 504 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 505 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 506 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 507 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 508 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 509 510 /* attach the supporting struct to Cmpi for reuse */ 511 c = (Mat_MPIAIJ*)Cmpi->data; 512 c->ptap = ptap; 513 ptap->duplicate = Cmpi->ops->duplicate; 514 ptap->destroy = Cmpi->ops->destroy; 515 516 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 517 Cmpi->assembled = PETSC_FALSE; 518 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 519 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP; 520 *C = Cmpi; 521 PetscFunctionReturn(0); 522 } 523 524 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C) 525 { 526 PetscErrorCode ierr; 527 Mat_PtAPMPI *ptap; 528 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 529 MPI_Comm comm; 530 PetscMPIInt size,rank; 531 Mat Cmpi; 532 PetscFreeSpaceList free_space=NULL,current_space=NULL; 533 PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 534 PetscInt *lnk,i,k,pnz,row,nsend; 535 PetscBT lnkbt; 536 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv; 537 PetscInt 
**buf_rj,**buf_ri,**buf_ri_k; 538 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 539 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 540 MPI_Request *swaits,*rwaits; 541 MPI_Status *sstatus,rstatus; 542 PetscLayout rowmap; 543 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 544 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 545 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi; 546 Mat_SeqAIJ *p_loc,*p_oth,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c_loc,*c_oth; 547 PetscScalar *apv; 548 PetscTable ta; 549 #if defined(PETSC_HAVE_HYPRE) 550 const char *algTypes[3] = {"scalable","nonscalable","hypre"}; 551 PetscInt nalg = 3; 552 #else 553 const char *algTypes[2] = {"scalable","nonscalable"}; 554 PetscInt nalg = 2; 555 #endif 556 PetscInt alg = 1; /* set default algorithm */ 557 #if defined(PETSC_USE_INFO) 558 PetscReal apfill; 559 #endif 560 #if defined(PTAP_PROFILE) 561 PetscLogDouble t0,t1,t11,t12,t2,t3,t4; 562 #endif 563 PetscBool flg; 564 565 PetscFunctionBegin; 566 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 567 568 /* pick an algorithm */ 569 ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 570 ierr = PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[1],&alg,&flg);CHKERRQ(ierr); 571 ierr = PetscOptionsEnd();CHKERRQ(ierr); 572 573 if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */ 574 MatInfo Ainfo,Pinfo; 575 PetscInt nz_local; 576 PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable; 577 578 ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr); 579 ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr); 580 nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated); 581 582 if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE; 583 ierr = 
MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr); 584 585 if (alg_scalable) { 586 alg = 0; /* scalable algorithm would 50% slower than nonscalable algorithm */ 587 ierr = PetscInfo2(P,"Use scalable algorithm, pN %D, fill*nz_allocated %g\n",pN,fill*nz_local);CHKERRQ(ierr); 588 } 589 } 590 /* ierr = PetscInfo2(P,"Use algorithm %D, pN %D\n",alg,pN);CHKERRQ(ierr); */ 591 592 if (alg == 0) { 593 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr); 594 (*C)->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 595 PetscFunctionReturn(0); 596 597 #if defined(PETSC_HAVE_HYPRE) 598 } else if (alg == 2) { 599 /* Use boomerAMGBuildCoarseOperator */ 600 ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr); 601 PetscFunctionReturn(0); 602 #endif 603 } 604 605 #if defined(PTAP_PROFILE) 606 ierr = PetscTime(&t0);CHKERRQ(ierr); 607 #endif 608 609 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 610 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 611 612 /* create symbolic parallel matrix Cmpi */ 613 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 614 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 615 616 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 617 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 618 619 /* create struct Mat_PtAPMPI and attached it to C later */ 620 ierr = PetscNew(&ptap);CHKERRQ(ierr); 621 ptap->reuse = MAT_INITIAL_MATRIX; 622 623 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 624 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 625 /* get P_loc by taking all local rows of P */ 626 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 627 628 /* (0) compute Rd = Pd^T, Ro = Po^T */ 629 /* --------------------------------- */ 630 ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 631 ierr = 
MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 632 #if defined(PTAP_PROFILE) 633 ierr = PetscTime(&t11);CHKERRQ(ierr); 634 #endif 635 636 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 637 /* ----------------------------------------------------------------- */ 638 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 639 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 640 641 /* create and initialize a linked list */ 642 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 643 MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta); 644 MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta); 645 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 646 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 647 648 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 649 650 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 651 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 652 current_space = free_space; 653 nspacedouble = 0; 654 655 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 656 api[0] = 0; 657 for (i=0; i<am; i++) { 658 /* diagonal portion: Ad[i,:]*P */ 659 ai = ad->i; pi = p_loc->i; 660 nzi = ai[i+1] - ai[i]; 661 aj = ad->j + ai[i]; 662 for (j=0; j<nzi; j++) { 663 row = aj[j]; 664 pnz = pi[row+1] - pi[row]; 665 Jptr = p_loc->j + pi[row]; 666 /* add non-zero cols of P into the sorted linked list lnk */ 667 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 668 } 669 /* off-diagonal portion: Ao[i,:]*P */ 670 ai = ao->i; pi = p_oth->i; 671 nzi = ai[i+1] - ai[i]; 672 aj = ao->j + ai[i]; 673 for (j=0; j<nzi; j++) { 674 row = aj[j]; 675 pnz = pi[row+1] - pi[row]; 676 Jptr = p_oth->j + pi[row]; 677 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 678 } 679 apnz 
= lnk[0]; 680 api[i+1] = api[i] + apnz; 681 if (ap_rmax < apnz) ap_rmax = apnz; 682 683 /* if free space is not available, double the total space in the list */ 684 if (current_space->local_remaining<apnz) { 685 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 686 nspacedouble++; 687 } 688 689 /* Copy data into free space, then initialize lnk */ 690 ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 691 692 current_space->array += apnz; 693 current_space->local_used += apnz; 694 current_space->local_remaining -= apnz; 695 } 696 /* Allocate space for apj and apv, initialize apj, and */ 697 /* destroy list of free space and other temporary array(s) */ 698 ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 699 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 700 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 701 702 /* Create AP_loc for reuse */ 703 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 704 705 #if defined(PETSC_USE_INFO) 706 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 707 ptap->AP_loc->info.mallocs = nspacedouble; 708 ptap->AP_loc->info.fill_ratio_given = fill; 709 ptap->AP_loc->info.fill_ratio_needed = apfill; 710 711 if (api[am]) { 712 ierr = PetscInfo3(ptap->AP_loc,"AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 713 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 714 } else { 715 ierr = PetscInfo(ptap->AP_loc,"AP_loc is empty \n");CHKERRQ(ierr); 716 } 717 #endif 718 719 #if defined(PTAP_PROFILE) 720 ierr = PetscTime(&t12);CHKERRQ(ierr); 721 #endif 722 723 /* (2-1) compute symbolic Co = Ro*AP_loc */ 724 /* ------------------------------------ */ 725 ierr = 
MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr); 726 #if defined(PTAP_PROFILE) 727 ierr = PetscTime(&t1);CHKERRQ(ierr); 728 #endif 729 730 /* (3) send coj of C_oth to other processors */ 731 /* ------------------------------------------ */ 732 /* determine row ownership */ 733 ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 734 rowmap->n = pn; 735 rowmap->bs = 1; 736 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 737 owners = rowmap->range; 738 739 /* determine the number of messages to send, their lengths */ 740 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 741 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 742 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 743 744 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 745 coi = c_oth->i; coj = c_oth->j; 746 con = ptap->C_oth->rmap->n; 747 proc = 0; 748 for (i=0; i<con; i++) { 749 while (prmap[i] >= owners[proc+1]) proc++; 750 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 751 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 752 } 753 754 len = 0; /* max length of buf_si[], see (4) */ 755 owners_co[0] = 0; 756 nsend = 0; 757 for (proc=0; proc<size; proc++) { 758 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 759 if (len_s[proc]) { 760 nsend++; 761 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 762 len += len_si[proc]; 763 } 764 } 765 766 /* determine the number and length of messages to receive for coi and coj */ 767 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 768 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 769 770 /* post the Irecv and Isend of coj */ 771 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 772 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 773 ierr = 
PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 774 for (proc=0, k=0; proc<size; proc++) { 775 if (!len_s[proc]) continue; 776 i = owners_co[proc]; 777 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 778 k++; 779 } 780 781 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 782 /* ---------------------------------------- */ 783 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr); 784 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 785 786 /* receives coj are complete */ 787 for (i=0; i<nrecv; i++) { 788 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 789 } 790 ierr = PetscFree(rwaits);CHKERRQ(ierr); 791 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 792 793 /* add received column indices into ta to update Crmax */ 794 for (k=0; k<nrecv; k++) {/* k-th received message */ 795 Jptr = buf_rj[k]; 796 for (j=0; j<len_r[k]; j++) { 797 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 798 } 799 } 800 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 801 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 802 803 /* (4) send and recv coi */ 804 /*-----------------------*/ 805 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 806 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 807 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 808 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 809 for (proc=0,k=0; proc<size; proc++) { 810 if (!len_s[proc]) continue; 811 /* form outgoing message for i-structure: 812 buf_si[0]: nrows to be sent 813 [1:nrows]: row index (global) 814 [nrows+1:2*nrows+1]: i-structure index 815 */ 816 /*-------------------------------------------*/ 817 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 818 buf_si_i = buf_si + nrows+1; 819 buf_si[0] = nrows; 820 buf_si_i[0] = 0; 821 nrows = 0; 822 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 823 nzi = 
coi[i+1] - coi[i]; 824 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 825 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 826 nrows++; 827 } 828 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 829 k++; 830 buf_si += len_si[proc]; 831 } 832 for (i=0; i<nrecv; i++) { 833 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 834 } 835 ierr = PetscFree(rwaits);CHKERRQ(ierr); 836 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 837 838 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 839 ierr = PetscFree(len_ri);CHKERRQ(ierr); 840 ierr = PetscFree(swaits);CHKERRQ(ierr); 841 ierr = PetscFree(buf_s);CHKERRQ(ierr); 842 #if defined(PTAP_PROFILE) 843 ierr = PetscTime(&t2);CHKERRQ(ierr); 844 #endif 845 /* (5) compute the local portion of Cmpi */ 846 /* ------------------------------------------ */ 847 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 848 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 849 current_space = free_space; 850 851 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 852 for (k=0; k<nrecv; k++) { 853 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 854 nrows = *buf_ri_k[k]; 855 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 856 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 857 } 858 859 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 860 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 861 for (i=0; i<pn; i++) { 862 /* add C_loc into Cmpi */ 863 nzi = c_loc->i[i+1] - c_loc->i[i]; 864 Jptr = c_loc->j + c_loc->i[i]; 865 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 866 867 /* add received col data into lnk */ 868 for (k=0; k<nrecv; k++) { /* k-th received message */ 869 if (i == *nextrow[k]) 
{ /* i-th row */ 870 nzi = *(nextci[k]+1) - *nextci[k]; 871 Jptr = buf_rj[k] + *nextci[k]; 872 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 873 nextrow[k]++; nextci[k]++; 874 } 875 } 876 nzi = lnk[0]; 877 878 /* copy data into free space, then initialize lnk */ 879 ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 880 ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 881 } 882 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 883 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 884 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 885 #if defined(PTAP_PROFILE) 886 ierr = PetscTime(&t3);CHKERRQ(ierr); 887 #endif 888 889 /* local sizes and preallocation */ 890 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 891 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr); 892 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 893 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 894 895 /* members in merge */ 896 ierr = PetscFree(id_r);CHKERRQ(ierr); 897 ierr = PetscFree(len_r);CHKERRQ(ierr); 898 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 899 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 900 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 901 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 902 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 903 904 /* attach the supporting struct to Cmpi for reuse */ 905 c = (Mat_MPIAIJ*)Cmpi->data; 906 c->ptap = ptap; 907 ptap->duplicate = Cmpi->ops->duplicate; 908 ptap->destroy = Cmpi->ops->destroy; 909 910 if (alg == 1) { 911 ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr); 912 } 913 914 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 915 Cmpi->assembled = PETSC_FALSE; 916 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 917 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP; 918 *C = Cmpi; 919 920 #if defined(PTAP_PROFILE) 921 ierr = 
PetscTime(&t4);CHKERRQ(ierr);
  if (rank == 1) {
    /* NOTE(review): the "t2-t2" term is always zero — presumably t2-t12 was intended;
       confirm against the full symbolic routine (its start is outside this chunk) */
    printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t1-t11,t12-t11,t2-t2,t3-t2,t4-t3,t4-t0);
  }
#endif
  PetscFunctionReturn(0);
}

/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - numeric phase of C = P^T*A*P computed as
   C = R*(A*P) with R = P^T formed locally (Rd = Pd^T, Ro = Po^T).

   Uses the symbolic structures stored in c->ptap by the symbolic phase:
     AP_loc (A_loc*P), Rd/Ro, C_loc/C_oth, and the dense work array apa.
   On reuse (ptap->reuse == MAT_REUSE_MATRIX) the transposes and the
   off-process rows of P (P_oth) are refreshed numerically first.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode    ierr;
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
  Mat_PtAPMPI       *ptap=c->ptap;               /* support struct attached by the symbolic phase */
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am=A->rmap->n,j,col,apnz;
  PetscScalar       *apa;                        /* dense accumulator of length pN for one row of A*P */
  const PetscInt    *cols;
  const PetscScalar *vals;
#if defined(PTAP_PROFILE)
  PetscMPIInt    rank;
  MPI_Comm       comm;
  PetscLogDouble t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi;
#endif

  PetscFunctionBegin;
  ierr = MatZeroEntries(C);CHKERRQ(ierr);

  /* 1) get R = Pd^T,Ro = Po^T */
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* first call built Rd/Ro in the symbolic phase; here only refresh values */
    ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr);
    ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
  eR = t1 - t0;
#endif

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  apa   = ptap->apa;
  api   = ap->i;
  apj   = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    /* gather the dense row apa into the sparse storage of AP_loc, zeroing apa as we go */
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
    ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
  eAP = t2 - t1;
#endif

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr);
  ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr);
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
  eCseq = t3 - t2;
#endif

  /* add C_loc and Co to C */
  ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);

  /* C_loc -> C: rows are already local, shift by the ownership offset */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = rstart + i;
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }

  /* Co -> C, off-processor part: global row index comes from P's column map */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];
    ierr  = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr);
    cols += ncols; vals += ncols;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  eCmpi = t4 - t3;

  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (rank==1) {
    ierr = PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);CHKERRQ(ierr);
  }
#endif
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap - symbolic phase of C = P^T*A*P using the
   "ptap" algorithm: build the structure of AP = A_loc*P, of Co = Po^T*AP (rows
   owned by other processes, communicated in steps (3)/(4)), and of the local
   C rows; the resulting i/j arrays and communication metadata are stashed in a
   Mat_PtAPMPI/Mat_Merge_SeqsToMPI pair attached to C for the numeric phase.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=NULL,current_space=NULL;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            *adi=ad->i,*aj,*aoi=ao->i,nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*pti,*ptj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0;
  PetscReal           afill=1.0,afill_tmp;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attached it to C later */
  ierr = PetscNew(&ptap);CHKERRQ(ierr);
  ierr = PetscNew(&merge);CHKERRQ(ierr);
  ptap->merge = merge;
  ptap->reuse = MAT_INITIAL_MATRIX;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */
  /*--------------------------------------------------------------------------*/
  ierr   = PetscMalloc1(am+1,&api);CHKERRQ(ierr);
  api[0] = 0;

  /* create and initialize a linked list (sorted, condensed column-index accumulator) */
  ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) -OOM for ex56, np=8k on Intrepid! */
  ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<am; i++) {
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    aj  = ad->j + adi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    aj  = ao->j + aoi[i];
    for (j=0; j<nzi; j++) {
      row  = aj[j];
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz; /* track max row length for the scalable apa buffer */

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      /* NOTE(review): "&current_space" restored here (and below) from a mis-encoded
         character in the incoming text; matches the PetscFreeSpaceGet call above */
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/
  /*-----------------------------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* then, compute symbolic Co = (p->B)^T*AP */
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                            >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */
  nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],api[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pot */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr);

  /* (3) send j-array (coj) of Co to other processors */
  /*--------------------------------------------------*/
  ierr  = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  merge->nsend = 0;

  /* determine row ownership */
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++) {
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++) {
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  for (i=0; i<merge->nrecv; i++) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* (4) send and recv coi */
  /*-----------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* (5) compute the local portion of C (mpi mat) */
  /*----------------------------------------------*/
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);

  /* allocate pti array and free space for accumulating nonzero column info */
  ierr   = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr);
  pti[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
  nnz           = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pi_loc[pm],api[am]));
  ierr          = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */
  }
  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);

  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pt */
      apnz = api[row+1] - api[row];
      Jptr = apj + api[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;

    pti[i+1] = pti[i] + nnz;
  }
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* (6) create symbolic parallel matrix Cmpi */
  /*------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* keep the symbolic i/j arrays and communication buffers for the numeric phase */
  merge->bi        = pti; /* Cseq->i */
  merge->bj        = ptj; /* Cseq->j */
  merge->coi       = coi; /* Co->i */
  merge->coj       = coj; /* Co->j */
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;

  /* attach the supporting struct to Cmpi for reuse */
  c               = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap         = ptap;
  ptap->api       = api;
  ptap->apj       = apj;
  ptap->duplicate = Cmpi->ops->duplicate;
  ptap->destroy   = Cmpi->ops->destroy;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
  Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap;
  *C = Cmpi;

  /* flag 'scalable' determines which implementations to be used:
       0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa;
       1: do sparse axpy in MatPtAPNumeric() - might slow, uses a sparse array apa */
  /* set default scalable */
  ptap->scalable = PETSC_FALSE; /* PETSC_TRUE; */

  ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->options,((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr);
  if (!ptap->scalable) { /* Do dense axpy */
    ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr);
  } else {
    ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_INFO)
  if (pti[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap - numeric phase matching
   MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap; fills the values of C using the
   structures stashed in c->ptap/merge (dense or sparse axpy per ptap->scalable).
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_SeqAIJ
*p_loc,*p_oth; 1440 Mat_PtAPMPI *ptap; 1441 Mat_Merge_SeqsToMPI *merge; 1442 PetscInt *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp; 1443 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj; 1444 PetscInt i,j,k,anz,pnz,apnz,nextap,row,*cj; 1445 MatScalar *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp; 1446 PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n; 1447 MPI_Comm comm; 1448 PetscMPIInt size,rank,taga,*len_s; 1449 PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci; 1450 PetscInt **buf_ri,**buf_rj; 1451 PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */ 1452 MPI_Request *s_waits,*r_waits; 1453 MPI_Status *status; 1454 MatScalar **abuf_r,*ba_i,*pA,*coa,*ba; 1455 PetscInt *api,*apj,*coi,*coj; 1456 PetscInt *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend; 1457 PetscBool scalable; 1458 #if defined(PTAP_PROFILE) 1459 PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2; 1460 #endif 1461 1462 PetscFunctionBegin; 1463 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1464 #if defined(PTAP_PROFILE) 1465 ierr = PetscTime(&t0);CHKERRQ(ierr); 1466 #endif 1467 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1468 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1469 1470 ptap = c->ptap; 1471 if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX"); 1472 merge = ptap->merge; 1473 apa = ptap->apa; 1474 scalable = ptap->scalable; 1475 1476 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 1477 /*-----------------------------------------------------*/ 1478 if (ptap->reuse == MAT_INITIAL_MATRIX) { 1479 /* P_oth and P_loc are obtained in MatPtASymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */ 1480 ptap->reuse = MAT_REUSE_MATRIX; 1481 } else { /* update numerical values of P_oth and P_loc */ 1482 ierr = 
MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1483 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 1484 } 1485 #if defined(PTAP_PROFILE) 1486 ierr = PetscTime(&t1);CHKERRQ(ierr); 1487 eP = t1-t0; 1488 #endif 1489 /* 1490 printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank, 1491 a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N, 1492 ptap->P_loc->rmap->N,ptap->P_loc->cmap->N, 1493 ptap->P_oth->rmap->N,ptap->P_oth->cmap->N); 1494 */ 1495 1496 /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */ 1497 /*--------------------------------------------------------------*/ 1498 /* get data from symbolic products */ 1499 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 1500 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 1501 pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a; 1502 pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 1503 1504 coi = merge->coi; coj = merge->coj; 1505 ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr); 1506 1507 bi = merge->bi; bj = merge->bj; 1508 owners = merge->rowmap->range; 1509 ierr = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr); /* ba: Cseq->a */ 1510 1511 api = ptap->api; apj = ptap->apj; 1512 1513 if (!scalable) { /* Do dense axpy on apa (length of pN, stores A[i,:]*P) - nonscalable, but faster (could take 1/3 scalable time) */ 1514 ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr); 1515 #if defined(PTAP_PROFILE) 1516 ierr = PetscTime(&t1);CHKERRQ(ierr); 1517 #endif 1518 for (i=0; i<am; i++) { 1519 #if defined(PTAP_PROFILE) 1520 ierr = PetscTime(&t2_0);CHKERRQ(ierr); 1521 #endif 1522 /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */ 1523 /*------------------------------------------------------------*/ 1524 apJ = apj + api[i]; 1525 1526 /* diagonal portion of A */ 1527 anz = adi[i+1] - adi[i]; 1528 adj = ad->j + adi[i]; 1529 ada = ad->a + adi[i]; 1530 for (j=0; j<anz; j++) { 
1531 row = adj[j]; 1532 pnz = pi_loc[row+1] - pi_loc[row]; 1533 pj = pj_loc + pi_loc[row]; 1534 pa = pa_loc + pi_loc[row]; 1535 1536 /* perform dense axpy */ 1537 valtmp = ada[j]; 1538 for (k=0; k<pnz; k++) { 1539 apa[pj[k]] += valtmp*pa[k]; 1540 } 1541 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 1542 } 1543 1544 /* off-diagonal portion of A */ 1545 anz = aoi[i+1] - aoi[i]; 1546 aoj = ao->j + aoi[i]; 1547 aoa = ao->a + aoi[i]; 1548 for (j=0; j<anz; j++) { 1549 row = aoj[j]; 1550 pnz = pi_oth[row+1] - pi_oth[row]; 1551 pj = pj_oth + pi_oth[row]; 1552 pa = pa_oth + pi_oth[row]; 1553 1554 /* perform dense axpy */ 1555 valtmp = aoa[j]; 1556 for (k=0; k<pnz; k++) { 1557 apa[pj[k]] += valtmp*pa[k]; 1558 } 1559 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 1560 } 1561 #if defined(PTAP_PROFILE) 1562 ierr = PetscTime(&t2_1);CHKERRQ(ierr); 1563 et2_AP += t2_1 - t2_0; 1564 #endif 1565 1566 /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */ 1567 /*--------------------------------------------------------------*/ 1568 apnz = api[i+1] - api[i]; 1569 /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */ 1570 pnz = po->i[i+1] - po->i[i]; 1571 poJ = po->j + po->i[i]; 1572 pA = po->a + po->i[i]; 1573 for (j=0; j<pnz; j++) { 1574 row = poJ[j]; 1575 cnz = coi[row+1] - coi[row]; 1576 cj = coj + coi[row]; 1577 ca = coa + coi[row]; 1578 /* perform dense axpy */ 1579 valtmp = pA[j]; 1580 for (k=0; k<cnz; k++) { 1581 ca[k] += valtmp*apa[cj[k]]; 1582 } 1583 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 1584 } 1585 /* put the value into Cd (diagonal part) */ 1586 pnz = pd->i[i+1] - pd->i[i]; 1587 pdJ = pd->j + pd->i[i]; 1588 pA = pd->a + pd->i[i]; 1589 for (j=0; j<pnz; j++) { 1590 row = pdJ[j]; 1591 cnz = bi[row+1] - bi[row]; 1592 cj = bj + bi[row]; 1593 ca = ba + bi[row]; 1594 /* perform dense axpy */ 1595 valtmp = pA[j]; 1596 for (k=0; k<cnz; k++) { 1597 ca[k] += valtmp*apa[cj[k]]; 1598 } 1599 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 1600 } 1601 1602 
/* zero the current row of A*P */ 1603 for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0; 1604 #if defined(PTAP_PROFILE) 1605 ierr = PetscTime(&t2_2);CHKERRQ(ierr); 1606 ePtAP += t2_2 - t2_1; 1607 #endif 1608 } 1609 } else { /* Do sparse axpy on apa (length of ap_rmax, stores A[i,:]*P) - scalable, but slower */ 1610 ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr); 1611 /*-----------------------------------------------------------------------------------------*/ 1612 pA=pa_loc; 1613 for (i=0; i<am; i++) { 1614 #if defined(PTAP_PROFILE) 1615 ierr = PetscTime(&t2_0);CHKERRQ(ierr); 1616 #endif 1617 /* form i-th sparse row of A*P */ 1618 apnz = api[i+1] - api[i]; 1619 apJ = apj + api[i]; 1620 /* diagonal portion of A */ 1621 anz = adi[i+1] - adi[i]; 1622 adj = ad->j + adi[i]; 1623 ada = ad->a + adi[i]; 1624 for (j=0; j<anz; j++) { 1625 row = adj[j]; 1626 pnz = pi_loc[row+1] - pi_loc[row]; 1627 pj = pj_loc + pi_loc[row]; 1628 pa = pa_loc + pi_loc[row]; 1629 valtmp = ada[j]; 1630 nextp = 0; 1631 for (k=0; nextp<pnz; k++) { 1632 if (apJ[k] == pj[nextp]) { /* col of AP == col of P */ 1633 apa[k] += valtmp*pa[nextp++]; 1634 } 1635 } 1636 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 1637 } 1638 /* off-diagonal portion of A */ 1639 anz = aoi[i+1] - aoi[i]; 1640 aoj = ao->j + aoi[i]; 1641 aoa = ao->a + aoi[i]; 1642 for (j=0; j<anz; j++) { 1643 row = aoj[j]; 1644 pnz = pi_oth[row+1] - pi_oth[row]; 1645 pj = pj_oth + pi_oth[row]; 1646 pa = pa_oth + pi_oth[row]; 1647 valtmp = aoa[j]; 1648 nextp = 0; 1649 for (k=0; nextp<pnz; k++) { 1650 if (apJ[k] == pj[nextp]) { /* col of AP == col of P */ 1651 apa[k] += valtmp*pa[nextp++]; 1652 } 1653 } 1654 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 1655 } 1656 #if defined(PTAP_PROFILE) 1657 ierr = PetscTime(&t2_1);CHKERRQ(ierr); 1658 et2_AP += t2_1 - t2_0; 1659 #endif 1660 1661 /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */ 1662 /*--------------------------------------------------------------*/ 1663 pnz = 
pi_loc[i+1] - pi_loc[i]; 1664 pJ = pj_loc + pi_loc[i]; 1665 for (j=0; j<pnz; j++) { 1666 nextap = 0; 1667 row = pJ[j]; /* global index */ 1668 if (row < pcstart || row >=pcend) { /* put the value into Co */ 1669 row = *poJ; 1670 cj = coj + coi[row]; 1671 ca = coa + coi[row]; poJ++; 1672 } else { /* put the value into Cd */ 1673 row = *pdJ; 1674 cj = bj + bi[row]; 1675 ca = ba + bi[row]; pdJ++; 1676 } 1677 valtmp = pA[j]; 1678 for (k=0; nextap<apnz; k++) { 1679 if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++]; 1680 } 1681 ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr); 1682 } 1683 pA += pnz; 1684 /* zero the current row info for A*P */ 1685 ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr); 1686 #if defined(PTAP_PROFILE) 1687 ierr = PetscTime(&t2_2);CHKERRQ(ierr); 1688 ePtAP += t2_2 - t2_1; 1689 #endif 1690 } 1691 } 1692 #if defined(PTAP_PROFILE) 1693 ierr = PetscTime(&t2);CHKERRQ(ierr); 1694 #endif 1695 1696 /* 3) send and recv matrix values coa */ 1697 /*------------------------------------*/ 1698 buf_ri = merge->buf_ri; 1699 buf_rj = merge->buf_rj; 1700 len_s = merge->len_s; 1701 ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr); 1702 ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 1703 1704 ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr); 1705 for (proc=0,k=0; proc<size; proc++) { 1706 if (!len_s[proc]) continue; 1707 i = merge->owners_co[proc]; 1708 ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 1709 k++; 1710 } 1711 if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);} 1712 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);} 1713 1714 ierr = PetscFree2(s_waits,status);CHKERRQ(ierr); 1715 ierr = PetscFree(r_waits);CHKERRQ(ierr); 1716 ierr = PetscFree(coa);CHKERRQ(ierr); 1717 #if defined(PTAP_PROFILE) 1718 ierr = PetscTime(&t3);CHKERRQ(ierr); 1719 #endif 
1720 1721 /* 4) insert local Cseq and received values into Cmpi */ 1722 /*------------------------------------------------------*/ 1723 ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr); 1724 for (k=0; k<merge->nrecv; k++) { 1725 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1726 nrows = *(buf_ri_k[k]); 1727 nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 1728 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 1729 } 1730 1731 for (i=0; i<cm; i++) { 1732 row = owners[rank] + i; /* global row index of C_seq */ 1733 bj_i = bj + bi[i]; /* col indices of the i-th row of C */ 1734 ba_i = ba + bi[i]; 1735 bnz = bi[i+1] - bi[i]; 1736 /* add received vals into ba */ 1737 for (k=0; k<merge->nrecv; k++) { /* k-th received message */ 1738 /* i-th row */ 1739 if (i == *nextrow[k]) { 1740 cnz = *(nextci[k]+1) - *nextci[k]; 1741 cj = buf_rj[k] + *(nextci[k]); 1742 ca = abuf_r[k] + *(nextci[k]); 1743 nextcj = 0; 1744 for (j=0; nextcj<cnz; j++) { 1745 if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */ 1746 ba_i[j] += ca[nextcj++]; 1747 } 1748 } 1749 nextrow[k]++; nextci[k]++; 1750 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 1751 } 1752 } 1753 ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 1754 } 1755 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1756 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1757 1758 ierr = PetscFree(ba);CHKERRQ(ierr); 1759 ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr); 1760 ierr = PetscFree(abuf_r);CHKERRQ(ierr); 1761 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1762 #if defined(PTAP_PROFILE) 1763 ierr = PetscTime(&t4);CHKERRQ(ierr); 1764 if (rank==1) { 1765 ierr = PetscPrintf(MPI_COMM_SELF," [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr); 1766 } 1767 #endif 
1768 PetscFunctionReturn(0); 1769 } 1770