1 2 /* 3 Defines projective product routines where A is a MPIAIJ matrix 4 C = P^T * A * P 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <../src/mat/impls/aij/mpi/mpiaij.h> 10 #include <petscbt.h> 11 #include <petsctime.h> 12 13 /* #define PTAP_PROFILE */ 14 15 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer) 16 { 17 PetscErrorCode ierr; 18 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 19 Mat_PtAPMPI *ptap=a->ptap; 20 PetscBool iascii; 21 PetscViewerFormat format; 22 23 PetscFunctionBegin; 24 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr); 25 if (iascii) { 26 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr); 27 if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) { 28 if (ptap->algType == 0) { 29 ierr = PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");CHKERRQ(ierr); 30 } else if (ptap->algType == 1) { 31 ierr = PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");CHKERRQ(ierr); 32 } 33 } 34 } 35 ierr = (ptap->view)(A,viewer);CHKERRQ(ierr); 36 PetscFunctionReturn(0); 37 } 38 39 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A) 40 { 41 PetscErrorCode ierr; 42 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 43 Mat_PtAPMPI *ptap=a->ptap; 44 45 PetscFunctionBegin; 46 if (ptap) { 47 Mat_Merge_SeqsToMPI *merge=ptap->merge; 48 ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr); 49 ierr = PetscFree(ptap->bufa);CHKERRQ(ierr); 50 ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr); 51 ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr); 52 ierr = MatDestroy(&ptap->A_loc);CHKERRQ(ierr); /* used by MatTransposeMatMult() */ 53 ierr = MatDestroy(&ptap->Rd);CHKERRQ(ierr); 54 ierr = MatDestroy(&ptap->Ro);CHKERRQ(ierr); 55 if (ptap->AP_loc) { /* used by alg_rap */ 56 Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data; 57 ierr = PetscFree(ap->i);CHKERRQ(ierr); 58 ierr = PetscFree2(ap->j,ap->a);CHKERRQ(ierr); 59 ierr = MatDestroy(&ptap->AP_loc);CHKERRQ(ierr); 60 } else { /* used by alg_ptap */ 61 ierr = PetscFree(ptap->api);CHKERRQ(ierr); 62 ierr = PetscFree(ptap->apj);CHKERRQ(ierr); 63 } 64 ierr = MatDestroy(&ptap->C_loc);CHKERRQ(ierr); 65 ierr = MatDestroy(&ptap->C_oth);CHKERRQ(ierr); 66 if (ptap->apa) {ierr = PetscFree(ptap->apa);CHKERRQ(ierr);} 67 68 if (merge) { /* used by alg_ptap */ 69 ierr = PetscFree(merge->id_r);CHKERRQ(ierr); 70 ierr = PetscFree(merge->len_s);CHKERRQ(ierr); 71 ierr = PetscFree(merge->len_r);CHKERRQ(ierr); 72 ierr = PetscFree(merge->bi);CHKERRQ(ierr); 73 ierr = PetscFree(merge->bj);CHKERRQ(ierr); 74 ierr = PetscFree(merge->buf_ri[0]);CHKERRQ(ierr); 75 ierr = PetscFree(merge->buf_ri);CHKERRQ(ierr); 76 ierr = PetscFree(merge->buf_rj[0]);CHKERRQ(ierr); 77 ierr = PetscFree(merge->buf_rj);CHKERRQ(ierr); 78 ierr = PetscFree(merge->coi);CHKERRQ(ierr); 79 ierr = PetscFree(merge->coj);CHKERRQ(ierr); 80 ierr = PetscFree(merge->owners_co);CHKERRQ(ierr); 81 ierr = PetscLayoutDestroy(&merge->rowmap);CHKERRQ(ierr); 82 ierr = PetscFree(ptap->merge);CHKERRQ(ierr); 83 } 84 85 ierr = ptap->destroy(A);CHKERRQ(ierr); 86 ierr = PetscFree(ptap);CHKERRQ(ierr); 87 } 88 PetscFunctionReturn(0); 89 } 90 91 PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M) 92 { 93 PetscErrorCode ierr; 94 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 95 Mat_PtAPMPI *ptap = a->ptap; 96 97 PetscFunctionBegin; 98 ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr); 99 (*M)->ops->destroy = ptap->destroy; 100 
(*M)->ops->duplicate = ptap->duplicate; 101 (*M)->ops->view = ptap->view; 102 PetscFunctionReturn(0); 103 } 104 105 PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C) 106 { 107 PetscErrorCode ierr; 108 PetscBool rap=PETSC_TRUE; /* do R=P^T locally, then C=R*A*P */ 109 MPI_Comm comm; 110 111 PetscFunctionBegin; 112 /* check if matrix local sizes are compatible */ 113 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 114 if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend); 115 if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend); 116 117 ierr = PetscOptionsGetBool(NULL,NULL,"-matrap",&rap,NULL);CHKERRQ(ierr); 118 if (scall == MAT_INITIAL_MATRIX) { 119 ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 120 if (rap) { /* do R=P^T locally, then C=R*A*P */ 121 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); 122 } else { /* do P^T*A*P */ 123 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(A,P,fill,C);CHKERRQ(ierr); 124 } 125 ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr); 126 } 127 ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 128 ierr = (*(*C)->ops->ptapnumeric)(A,P,*C);CHKERRQ(ierr); 129 ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr); 130 PetscFunctionReturn(0); 131 } 132 133 #if defined(PETSC_HAVE_HYPRE) 134 PETSC_INTERN PetscErrorCode MatPtAPSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*); 135 #endif 136 137 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C) 138 { 139 PetscErrorCode ierr; 140 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 141 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 142 Mat_SeqAIJ *ap,*p_loc,*p_oth,*c_seq; 143 Mat_PtAPMPI *ptap = c->ptap; 144 Mat AP_loc,C_loc,C_oth; 145 PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz; 146 PetscScalar *apa; 147 const PetscInt *cols; 148 const PetscScalar *vals; 149 150 PetscFunctionBegin; 151 ierr = MatZeroEntries(C);CHKERRQ(ierr); 152 153 /* 1) get R = Pd^T,Ro = Po^T */ 154 if (ptap->reuse == MAT_REUSE_MATRIX) { 155 ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr); 156 ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr); 157 } 158 159 /* 2) get AP_loc */ 160 AP_loc = ptap->AP_loc; 161 ap = (Mat_SeqAIJ*)AP_loc->data; 162 163 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 164 /*-----------------------------------------------------*/ 165 if (ptap->reuse == MAT_REUSE_MATRIX) { 166 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 167 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 168 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 169 } 170 171 /* 2-2) compute numeric A_loc*P - dominating part */ 172 /* ---------------------------------------------- */ 173 /* get data from symbolic products */ 174 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 175 if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 176 else 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!"); 177 178 api = ap->i; 179 apj = ap->j; 180 for (i=0; i<am; i++) { 181 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 182 apnz = api[i+1] - api[i]; 183 apa = ap->a + api[i]; 184 ierr = PetscMemzero(apa,sizeof(PetscScalar)*apnz);CHKERRQ(ierr); 185 AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa); 186 ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr); 187 } 188 189 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 190 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr); 191 ierr = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr); 192 193 C_loc = ptap->C_loc; 194 C_oth = ptap->C_oth; 195 196 /* add C_loc and Co to to C */ 197 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 198 199 /* C_loc -> C */ 200 cm = C_loc->rmap->N; 201 c_seq = (Mat_SeqAIJ*)C_loc->data; 202 cols = c_seq->j; 203 vals = c_seq->a; 204 for (i=0; i<cm; i++) { 205 ncols = c_seq->i[i+1] - c_seq->i[i]; 206 row = rstart + i; 207 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 208 cols += ncols; vals += ncols; 209 } 210 211 /* Co -> C, off-processor part */ 212 cm = C_oth->rmap->N; 213 c_seq = (Mat_SeqAIJ*)C_oth->data; 214 cols = c_seq->j; 215 vals = c_seq->a; 216 for (i=0; i<cm; i++) { 217 ncols = c_seq->i[i+1] - c_seq->i[i]; 218 row = p->garray[i]; 219 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 220 cols += ncols; vals += ncols; 221 } 222 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 223 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 224 225 ptap->reuse = MAT_REUSE_MATRIX; 226 PetscFunctionReturn(0); 227 } 228 229 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C) 230 { 231 PetscErrorCode ierr; 232 Mat_PtAPMPI *ptap; 233 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 234 MPI_Comm comm; 235 PetscMPIInt size,rank; 236 Mat Cmpi,P_loc,P_oth; 237 PetscFreeSpaceList free_space=NULL,current_space=NULL; 238 PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 239 PetscInt *lnk,i,k,pnz,row,nsend; 240 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv; 241 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 242 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 243 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 244 MPI_Request *swaits,*rwaits; 245 MPI_Status *sstatus,rstatus; 246 PetscLayout rowmap; 247 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 248 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 249 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi; 250 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 251 PetscScalar *apv; 252 PetscTable ta; 253 #if defined(PETSC_USE_INFO) 254 PetscReal apfill; 255 #endif 256 257 PetscFunctionBegin; 258 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 259 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 260 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 261 262 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 263 264 /* create symbolic parallel matrix Cmpi */ 265 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 266 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 267 268 /* create struct Mat_PtAPMPI and attached it to C later */ 269 ierr = PetscNew(&ptap);CHKERRQ(ierr); 270 ptap->reuse = MAT_INITIAL_MATRIX; 271 ptap->algType = 0; 
272 273 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 274 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);CHKERRQ(ierr); 275 /* get P_loc by taking all local rows of P */ 276 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);CHKERRQ(ierr); 277 278 ptap->P_loc = P_loc; 279 ptap->P_oth = P_oth; 280 281 /* (0) compute Rd = Pd^T, Ro = Po^T */ 282 /* --------------------------------- */ 283 ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 284 ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 285 286 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 287 /* ----------------------------------------------------------------- */ 288 p_loc = (Mat_SeqAIJ*)P_loc->data; 289 if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data; 290 291 /* create and initialize a linked list */ 292 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 293 MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta); 294 MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta); 295 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 296 297 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 298 299 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 300 if (ao) { 301 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 302 } else { 303 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr); 304 } 305 current_space = free_space; 306 nspacedouble = 0; 307 308 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 309 api[0] = 0; 310 for (i=0; i<am; i++) { 311 /* diagonal portion: Ad[i,:]*P */ 312 ai = ad->i; pi = p_loc->i; 313 nzi = ai[i+1] - ai[i]; 314 aj = ad->j + ai[i]; 315 for (j=0; j<nzi; j++) { 316 row = aj[j]; 317 pnz = pi[row+1] - pi[row]; 318 Jptr = p_loc->j + pi[row]; 319 /* add non-zero cols of P into the sorted linked list lnk */ 320 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 321 } 322 /* off-diagonal portion: Ao[i,:]*P */ 323 if (ao) { 324 ai = ao->i; pi = p_oth->i; 325 nzi = ai[i+1] - ai[i]; 326 aj = ao->j + ai[i]; 327 for (j=0; j<nzi; j++) { 328 row = aj[j]; 329 pnz = pi[row+1] - pi[row]; 330 Jptr = p_oth->j + pi[row]; 331 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 332 } 333 } 334 apnz = lnk[0]; 335 api[i+1] = api[i] + apnz; 336 337 /* if free space is not available, double the total space in the list */ 338 if (current_space->local_remaining<apnz) { 339 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 340 nspacedouble++; 341 } 342 343 /* Copy data into free space, then initialize lnk */ 344 ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr); 345 346 current_space->array += apnz; 347 current_space->local_used += apnz; 348 current_space->local_remaining -= apnz; 349 } 350 /* Allocate space for apj and apv, initialize apj, and */ 351 /* destroy list of free space and other temporary array(s) */ 352 ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 353 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 354 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 355 356 /* Create AP_loc for reuse */ 357 ierr = 
MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 358 359 #if defined(PETSC_USE_INFO) 360 if (ao) { 361 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 362 } else { 363 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 364 } 365 ptap->AP_loc->info.mallocs = nspacedouble; 366 ptap->AP_loc->info.fill_ratio_given = fill; 367 ptap->AP_loc->info.fill_ratio_needed = apfill; 368 369 if (api[am]) { 370 ierr = PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 371 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 372 } else { 373 ierr = PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");CHKERRQ(ierr); 374 } 375 #endif 376 377 /* (2-1) compute symbolic Co = Ro*AP_loc */ 378 /* ------------------------------------ */ 379 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr); 380 381 /* (3) send coj of C_oth to other processors */ 382 /* ------------------------------------------ */ 383 /* determine row ownership */ 384 ierr = PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 385 rowmap->n = pn; 386 rowmap->bs = 1; 387 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 388 owners = rowmap->range; 389 390 /* determine the number of messages to send, their lengths */ 391 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 392 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 393 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 394 395 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 396 coi = c_oth->i; coj = c_oth->j; 397 con = ptap->C_oth->rmap->n; 398 proc = 0; 399 for (i=0; i<con; i++) { 400 while (prmap[i] >= owners[proc+1]) proc++; 401 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 402 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 403 } 404 405 len = 0; /* max length of buf_si[], see (4) */ 406 owners_co[0] = 0; 407 nsend = 0; 408 for (proc=0; proc<size; proc++) { 409 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 410 if (len_s[proc]) { 411 nsend++; 412 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 413 len += len_si[proc]; 414 } 415 } 416 417 /* determine the number and length of messages to receive for coi and coj */ 418 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 419 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 420 421 /* post the Irecv and Isend of coj */ 422 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 423 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 424 ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 425 for (proc=0, k=0; proc<size; proc++) { 426 if (!len_s[proc]) continue; 427 i = owners_co[proc]; 428 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 429 k++; 430 } 431 432 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 433 /* ---------------------------------------- */ 434 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr); 435 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 436 437 /* receives coj are complete */ 438 for (i=0; i<nrecv; i++) { 439 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 440 } 441 ierr 
= PetscFree(rwaits);CHKERRQ(ierr); 442 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 443 444 /* add received column indices into ta to update Crmax */ 445 for (k=0; k<nrecv; k++) {/* k-th received message */ 446 Jptr = buf_rj[k]; 447 for (j=0; j<len_r[k]; j++) { 448 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 449 } 450 } 451 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 452 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 453 454 /* (4) send and recv coi */ 455 /*-----------------------*/ 456 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 457 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 458 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 459 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 460 for (proc=0,k=0; proc<size; proc++) { 461 if (!len_s[proc]) continue; 462 /* form outgoing message for i-structure: 463 buf_si[0]: nrows to be sent 464 [1:nrows]: row index (global) 465 [nrows+1:2*nrows+1]: i-structure index 466 */ 467 /*-------------------------------------------*/ 468 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 469 buf_si_i = buf_si + nrows+1; 470 buf_si[0] = nrows; 471 buf_si_i[0] = 0; 472 nrows = 0; 473 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 474 nzi = coi[i+1] - coi[i]; 475 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 476 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 477 nrows++; 478 } 479 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 480 k++; 481 buf_si += len_si[proc]; 482 } 483 for (i=0; i<nrecv; i++) { 484 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 485 } 486 ierr = PetscFree(rwaits);CHKERRQ(ierr); 487 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 488 489 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 490 ierr = PetscFree(len_ri);CHKERRQ(ierr); 491 ierr = PetscFree(swaits);CHKERRQ(ierr); 492 ierr = PetscFree(buf_s);CHKERRQ(ierr); 493 494 /* (5) compute the local portion of Cmpi */ 495 /* ------------------------------------------ */ 496 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 497 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 498 current_space = free_space; 499 500 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 501 for (k=0; k<nrecv; k++) { 502 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 503 nrows = *buf_ri_k[k]; 504 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 505 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 506 } 507 508 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 509 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 510 for (i=0; i<pn; i++) { 511 /* add C_loc into Cmpi */ 512 nzi = c_loc->i[i+1] - c_loc->i[i]; 513 Jptr = c_loc->j + c_loc->i[i]; 514 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 515 516 /* add received col data into lnk */ 517 for (k=0; k<nrecv; k++) { /* k-th received message */ 518 if (i == *nextrow[k]) { /* i-th row */ 519 nzi = *(nextci[k]+1) - *nextci[k]; 520 Jptr = buf_rj[k] + *nextci[k]; 521 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 522 nextrow[k]++; nextci[k]++; 523 } 524 } 525 nzi = lnk[0]; 526 527 /* copy data into free space, then initialize lnk */ 528 
ierr = PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);CHKERRQ(ierr); 529 ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 530 } 531 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 532 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 533 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 534 535 /* local sizes and preallocation */ 536 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 537 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr); 538 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 539 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 540 541 /* members in merge */ 542 ierr = PetscFree(id_r);CHKERRQ(ierr); 543 ierr = PetscFree(len_r);CHKERRQ(ierr); 544 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 545 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 546 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 547 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 548 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 549 550 /* attach the supporting struct to Cmpi for reuse */ 551 c = (Mat_MPIAIJ*)Cmpi->data; 552 c->ptap = ptap; 553 ptap->duplicate = Cmpi->ops->duplicate; 554 ptap->destroy = Cmpi->ops->destroy; 555 ptap->view = Cmpi->ops->view; 556 557 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 558 Cmpi->assembled = PETSC_FALSE; 559 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 560 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP; 561 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 562 *C = Cmpi; 563 PetscFunctionReturn(0); 564 } 565 566 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C) 567 { 568 PetscErrorCode ierr; 569 Mat_PtAPMPI *ptap; 570 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 571 MPI_Comm comm; 572 PetscMPIInt size,rank; 573 Mat Cmpi; 574 PetscFreeSpaceList free_space=NULL,current_space=NULL; 575 PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 576 PetscInt *lnk,i,k,pnz,row,nsend; 577 PetscBT lnkbt; 578 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv; 579 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 580 PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble; 581 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 582 MPI_Request *swaits,*rwaits; 583 MPI_Status *sstatus,rstatus; 584 PetscLayout rowmap; 585 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 586 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 587 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi; 588 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 589 PetscScalar *apv; 590 PetscTable ta; 591 #if defined(PETSC_HAVE_HYPRE) 592 const char *algTypes[3] = {"scalable","nonscalable","hypre"}; 593 PetscInt nalg = 3; 594 #else 595 const char *algTypes[2] = {"scalable","nonscalable"}; 596 PetscInt nalg = 2; 597 #endif 598 PetscInt alg = 1; /* set default algorithm */ 599 #if defined(PETSC_USE_INFO) 600 PetscReal apfill; 601 #endif 602 #if defined(PTAP_PROFILE) 603 PetscLogDouble t0,t1,t11,t12,t2,t3,t4; 604 #endif 605 PetscBool flg; 606 607 PetscFunctionBegin; 608 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 609 610 /* pick an algorithm */ 611 ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 612 PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */ 613 ierr 
= PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[1],&alg,&flg);CHKERRQ(ierr); 614 ierr = PetscOptionsEnd();CHKERRQ(ierr); 615 616 if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */ 617 MatInfo Ainfo,Pinfo; 618 PetscInt nz_local; 619 PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable; 620 621 ierr = MatGetInfo(A,MAT_LOCAL,&Ainfo);CHKERRQ(ierr); 622 ierr = MatGetInfo(P,MAT_LOCAL,&Pinfo);CHKERRQ(ierr); 623 nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated); 624 625 if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE; 626 ierr = MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);CHKERRQ(ierr); 627 628 if (alg_scalable) { 629 alg = 0; /* scalable algorithm would 50% slower than nonscalable algorithm */ 630 } 631 } 632 633 if (alg == 0) { 634 ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);CHKERRQ(ierr); 635 (*C)->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 636 PetscFunctionReturn(0); 637 638 #if defined(PETSC_HAVE_HYPRE) 639 } else if (alg == 2) { 640 /* Use boomerAMGBuildCoarseOperator */ 641 ierr = MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);CHKERRQ(ierr); 642 PetscFunctionReturn(0); 643 #endif 644 } 645 646 #if defined(PTAP_PROFILE) 647 ierr = PetscTime(&t0);CHKERRQ(ierr); 648 #endif 649 650 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 651 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 652 653 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 654 655 /* create symbolic parallel matrix Cmpi */ 656 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 657 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 658 659 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 660 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 661 662 /* create struct Mat_PtAPMPI and attached it to C later */ 663 ierr = PetscNew(&ptap);CHKERRQ(ierr); 664 ptap->reuse = MAT_INITIAL_MATRIX; 665 ptap->algType = alg; 666 667 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 668 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 669 /* get P_loc by taking all local rows of P */ 670 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 671 672 /* (0) compute Rd = Pd^T, Ro = Po^T */ 673 /* --------------------------------- */ 674 ierr = MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);CHKERRQ(ierr); 675 ierr = MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);CHKERRQ(ierr); 676 #if defined(PTAP_PROFILE) 677 ierr = PetscTime(&t11);CHKERRQ(ierr); 678 #endif 679 680 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 681 /* ----------------------------------------------------------------- */ 682 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 683 if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 684 685 /* create and initialize a linked list */ 686 ierr = PetscTableCreate(pn,pN,&ta);CHKERRQ(ierr); /* for compute AP_loc and Cmpi */ 687 MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta); 688 MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta); 689 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); /* Crmax = nnz(sum of Prows) */ 690 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 691 692 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 693 694 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 695 if (ao) { 696 ierr = 
PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);CHKERRQ(ierr); 697 } else { 698 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);CHKERRQ(ierr); 699 } 700 current_space = free_space; 701 nspacedouble = 0; 702 703 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 704 api[0] = 0; 705 for (i=0; i<am; i++) { 706 /* diagonal portion: Ad[i,:]*P */ 707 ai = ad->i; pi = p_loc->i; 708 nzi = ai[i+1] - ai[i]; 709 aj = ad->j + ai[i]; 710 for (j=0; j<nzi; j++) { 711 row = aj[j]; 712 pnz = pi[row+1] - pi[row]; 713 Jptr = p_loc->j + pi[row]; 714 /* add non-zero cols of P into the sorted linked list lnk */ 715 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 716 } 717 /* off-diagonal portion: Ao[i,:]*P */ 718 if (ao) { 719 ai = ao->i; pi = p_oth->i; 720 nzi = ai[i+1] - ai[i]; 721 aj = ao->j + ai[i]; 722 for (j=0; j<nzi; j++) { 723 row = aj[j]; 724 pnz = pi[row+1] - pi[row]; 725 Jptr = p_oth->j + pi[row]; 726 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 727 } 728 } 729 apnz = lnk[0]; 730 api[i+1] = api[i] + apnz; 731 if (ap_rmax < apnz) ap_rmax = apnz; 732 733 /* if free space is not available, double the total space in the list */ 734 if (current_space->local_remaining<apnz) { 735 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 736 nspacedouble++; 737 } 738 739 /* Copy data into free space, then initialize lnk */ 740 ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 741 742 current_space->array += apnz; 743 current_space->local_used += apnz; 744 current_space->local_remaining -= apnz; 745 } 746 /* Allocate space for apj and apv, initialize apj, and */ 747 /* destroy list of free space and other temporary array(s) */ 748 ierr = PetscMalloc2(api[am],&apj,api[am],&apv);CHKERRQ(ierr); 749 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 750 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 751 752 /* Create AP_loc for reuse */ 753 ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);CHKERRQ(ierr); 754 755 #if defined(PETSC_USE_INFO) 756 if (ao) { 757 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 758 } else { 759 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 760 } 761 ptap->AP_loc->info.mallocs = nspacedouble; 762 ptap->AP_loc->info.fill_ratio_given = fill; 763 ptap->AP_loc->info.fill_ratio_needed = apfill; 764 765 if (api[am]) { 766 ierr = PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);CHKERRQ(ierr); 767 ierr = PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);CHKERRQ(ierr); 768 } else { 769 ierr = PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");CHKERRQ(ierr); 770 } 771 #endif 772 773 #if defined(PTAP_PROFILE) 774 ierr = PetscTime(&t12);CHKERRQ(ierr); 775 #endif 776 777 /* (2-1) compute symbolic Co = Ro*AP_loc */ 778 /* ------------------------------------ */ 779 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);CHKERRQ(ierr); 780 #if defined(PTAP_PROFILE) 781 ierr = PetscTime(&t1);CHKERRQ(ierr); 782 #endif 783 784 /* (3) send coj of C_oth to other processors */ 785 /* ------------------------------------------ */ 786 /* determine row ownership */ 787 ierr = 
PetscLayoutCreate(comm,&rowmap);CHKERRQ(ierr); 788 rowmap->n = pn; 789 rowmap->bs = 1; 790 ierr = PetscLayoutSetUp(rowmap);CHKERRQ(ierr); 791 owners = rowmap->range; 792 793 /* determine the number of messages to send, their lengths */ 794 ierr = PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);CHKERRQ(ierr); 795 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 796 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 797 798 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 799 coi = c_oth->i; coj = c_oth->j; 800 con = ptap->C_oth->rmap->n; 801 proc = 0; 802 for (i=0; i<con; i++) { 803 while (prmap[i] >= owners[proc+1]) proc++; 804 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 805 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 806 } 807 808 len = 0; /* max length of buf_si[], see (4) */ 809 owners_co[0] = 0; 810 nsend = 0; 811 for (proc=0; proc<size; proc++) { 812 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 813 if (len_s[proc]) { 814 nsend++; 815 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 816 len += len_si[proc]; 817 } 818 } 819 820 /* determine the number and length of messages to receive for coi and coj */ 821 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);CHKERRQ(ierr); 822 ierr = PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);CHKERRQ(ierr); 823 824 /* post the Irecv and Isend of coj */ 825 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 826 ierr = PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 827 ierr = PetscMalloc1(nsend+1,&swaits);CHKERRQ(ierr); 828 for (proc=0, k=0; proc<size; proc++) { 829 if (!len_s[proc]) continue; 830 i = owners_co[proc]; 831 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 832 k++; 833 } 834 835 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 836 /* ---------------------------------------- */ 837 ierr = MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);CHKERRQ(ierr); 838 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 839 840 /* receives coj are complete */ 841 for (i=0; i<nrecv; i++) { 842 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 843 } 844 ierr = PetscFree(rwaits);CHKERRQ(ierr); 845 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 846 847 /* add received column indices into ta to update Crmax */ 848 for (k=0; k<nrecv; k++) {/* k-th received message */ 849 Jptr = buf_rj[k]; 850 for (j=0; j<len_r[k]; j++) { 851 ierr = PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);CHKERRQ(ierr); 852 } 853 } 854 ierr = PetscTableGetCount(ta,&Crmax);CHKERRQ(ierr); 855 ierr = PetscTableDestroy(&ta);CHKERRQ(ierr); 856 857 /* (4) send and recv coi */ 858 /*-----------------------*/ 859 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 860 ierr = PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 861 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 862 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 863 for (proc=0,k=0; proc<size; proc++) { 864 if (!len_s[proc]) continue; 865 /* form outgoing message for i-structure: 866 buf_si[0]: nrows to be sent 867 [1:nrows]: row index (global) 868 [nrows+1:2*nrows+1]: i-structure index 869 */ 870 /*-------------------------------------------*/ 871 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 872 buf_si_i = buf_si + nrows+1; 873 buf_si[0] = 
nrows; 874 buf_si_i[0] = 0; 875 nrows = 0; 876 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 877 nzi = coi[i+1] - coi[i]; 878 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 879 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 880 nrows++; 881 } 882 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 883 k++; 884 buf_si += len_si[proc]; 885 } 886 for (i=0; i<nrecv; i++) { 887 ierr = MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 888 } 889 ierr = PetscFree(rwaits);CHKERRQ(ierr); 890 if (nsend) {ierr = MPI_Waitall(nsend,swaits,sstatus);CHKERRQ(ierr);} 891 892 ierr = PetscFree4(len_s,len_si,sstatus,owners_co);CHKERRQ(ierr); 893 ierr = PetscFree(len_ri);CHKERRQ(ierr); 894 ierr = PetscFree(swaits);CHKERRQ(ierr); 895 ierr = PetscFree(buf_s);CHKERRQ(ierr); 896 #if defined(PTAP_PROFILE) 897 ierr = PetscTime(&t2);CHKERRQ(ierr); 898 #endif 899 /* (5) compute the local portion of Cmpi */ 900 /* ------------------------------------------ */ 901 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 902 ierr = PetscFreeSpaceGet(Crmax,&free_space);CHKERRQ(ierr); 903 current_space = free_space; 904 905 ierr = PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);CHKERRQ(ierr); 906 for (k=0; k<nrecv; k++) { 907 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 908 nrows = *buf_ri_k[k]; 909 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 910 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 911 } 912 913 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 914 ierr = PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);CHKERRQ(ierr); 915 for (i=0; i<pn; i++) { 916 /* add C_loc into Cmpi */ 917 nzi = c_loc->i[i+1] - c_loc->i[i]; 918 Jptr = c_loc->j + c_loc->i[i]; 919 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 920 921 /* add received col data into lnk */ 922 for (k=0; k<nrecv; k++) { /* k-th received message */ 923 if (i == *nextrow[k]) { /* i-th row */ 924 nzi = *(nextci[k]+1) - *nextci[k]; 925 Jptr = buf_rj[k] + *nextci[k]; 926 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 927 nextrow[k]++; nextci[k]++; 928 } 929 } 930 nzi = lnk[0]; 931 932 /* copy data into free space, then initialize lnk */ 933 ierr = PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 934 ierr = MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);CHKERRQ(ierr); 935 } 936 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 937 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 938 ierr = PetscFreeSpaceDestroy(free_space);CHKERRQ(ierr); 939 #if defined(PTAP_PROFILE) 940 ierr = PetscTime(&t3);CHKERRQ(ierr); 941 #endif 942 943 /* local sizes and preallocation */ 944 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 945 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr); 946 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 947 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 948 949 /* members in merge */ 950 ierr = PetscFree(id_r);CHKERRQ(ierr); 951 ierr = PetscFree(len_r);CHKERRQ(ierr); 952 ierr = PetscFree(buf_ri[0]);CHKERRQ(ierr); 953 ierr = PetscFree(buf_ri);CHKERRQ(ierr); 954 ierr = PetscFree(buf_rj[0]);CHKERRQ(ierr); 955 ierr = PetscFree(buf_rj);CHKERRQ(ierr); 956 ierr = PetscLayoutDestroy(&rowmap);CHKERRQ(ierr); 957 958 /* attach 
the supporting struct to Cmpi for reuse */ 959 c = (Mat_MPIAIJ*)Cmpi->data; 960 c->ptap = ptap; 961 ptap->duplicate = Cmpi->ops->duplicate; 962 ptap->destroy = Cmpi->ops->destroy; 963 ptap->view = Cmpi->ops->view; 964 965 if (alg == 1) { 966 ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr); 967 } 968 969 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 970 Cmpi->assembled = PETSC_FALSE; 971 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 972 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP; 973 Cmpi->ops->view = MatView_MPIAIJ_PtAP; 974 *C = Cmpi; 975 976 #if defined(PTAP_PROFILE) 977 ierr = PetscTime(&t4);CHKERRQ(ierr); 978 if (rank == 1) { 979 printf("PtAPSym: %g + %g + %g + %g + %g + %g = %g\n",t11-t0,t1-t11,t12-t11,t2-t2,t3-t2,t4-t3,t4-t0); 980 } 981 #endif 982 PetscFunctionReturn(0); 983 } 984 985 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 986 { 987 PetscErrorCode ierr; 988 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 989 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 990 Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq; 991 Mat_PtAPMPI *ptap = c->ptap; 992 Mat AP_loc,C_loc,C_oth; 993 PetscInt i,rstart,rend,cm,ncols,row; 994 PetscInt *api,*apj,am = A->rmap->n,j,col,apnz; 995 PetscScalar *apa; 996 const PetscInt *cols; 997 const PetscScalar *vals; 998 #if defined(PTAP_PROFILE) 999 PetscMPIInt rank; 1000 MPI_Comm comm; 1001 PetscLogDouble t0,t1,t2,t3,t4,eR,eAP,eCseq,eCmpi; 1002 #endif 1003 1004 PetscFunctionBegin; 1005 ierr = MatZeroEntries(C);CHKERRQ(ierr); 1006 1007 /* 1) get R = Pd^T,Ro = Po^T */ 1008 #if defined(PTAP_PROFILE) 1009 ierr = PetscTime(&t0);CHKERRQ(ierr); 1010 #endif 1011 if (ptap->reuse == MAT_REUSE_MATRIX) { 1012 ierr = MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);CHKERRQ(ierr); 1013 ierr = MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);CHKERRQ(ierr); 1014 } 1015 #if defined(PTAP_PROFILE) 1016 ierr = PetscTime(&t1);CHKERRQ(ierr); 1017 eR = t1 - t0; 1018 #endif 1019 1020 /* 2) get AP_loc */ 1021 AP_loc = ptap->AP_loc; 1022 ap = (Mat_SeqAIJ*)AP_loc->data; 1023 1024 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 1025 /*-----------------------------------------------------*/ 1026 if (ptap->reuse == MAT_REUSE_MATRIX) { 1027 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 1028 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1029 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 1030 } 1031 1032 /* 2-2) compute numeric A_loc*P - dominating part */ 1033 /* ---------------------------------------------- */ 1034 /* get data from symbolic products */ 1035 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 1036 if (ptap->P_oth) { 1037 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 1038 } 1039 apa = ptap->apa; 1040 api = ap->i; 1041 apj = ap->j; 1042 for (i=0; i<am; i++) { 1043 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 1044 AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa); 1045 apnz = api[i+1] - api[i]; 1046 for (j=0; j<apnz; j++) { 1047 col = apj[j+api[i]]; 1048 ap->a[j+ap->i[i]] = apa[col]; 1049 apa[col] = 0.0; 1050 } 1051 ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr); 1052 } 1053 #if defined(PTAP_PROFILE) 1054 ierr = PetscTime(&t2);CHKERRQ(ierr); 1055 eAP = t2 - t1; 1056 #endif 1057 1058 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 1059 ierr = ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);CHKERRQ(ierr); 
1060 ierr = ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);CHKERRQ(ierr); 1061 C_loc = ptap->C_loc; 1062 C_oth = ptap->C_oth; 1063 #if defined(PTAP_PROFILE) 1064 ierr = PetscTime(&t3);CHKERRQ(ierr); 1065 eCseq = t3 - t2; 1066 #endif 1067 1068 /* add C_loc and Co to to C */ 1069 ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr); 1070 1071 /* C_loc -> C */ 1072 cm = C_loc->rmap->N; 1073 c_seq = (Mat_SeqAIJ*)C_loc->data; 1074 cols = c_seq->j; 1075 vals = c_seq->a; 1076 for (i=0; i<cm; i++) { 1077 ncols = c_seq->i[i+1] - c_seq->i[i]; 1078 row = rstart + i; 1079 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 1080 cols += ncols; vals += ncols; 1081 } 1082 1083 /* Co -> C, off-processor part */ 1084 cm = C_oth->rmap->N; 1085 c_seq = (Mat_SeqAIJ*)C_oth->data; 1086 cols = c_seq->j; 1087 vals = c_seq->a; 1088 for (i=0; i<cm; i++) { 1089 ncols = c_seq->i[i+1] - c_seq->i[i]; 1090 row = p->garray[i]; 1091 ierr = MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);CHKERRQ(ierr); 1092 cols += ncols; vals += ncols; 1093 } 1094 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1095 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1096 #if defined(PTAP_PROFILE) 1097 ierr = PetscTime(&t4);CHKERRQ(ierr); 1098 eCmpi = t4 - t3; 1099 1100 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1101 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1102 if (rank==1) { 1103 ierr = PetscPrintf(MPI_COMM_SELF," R %g, AP %g, Cseq %g, Cmpi %g = %g\n", eR,eAP,eCseq,eCmpi,eR+eAP+eCseq+eCmpi);CHKERRQ(ierr); 1104 } 1105 #endif 1106 ptap->reuse = MAT_REUSE_MATRIX; 1107 PetscFunctionReturn(0); 1108 } 1109 1110 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,PetscReal fill,Mat *C) 1111 { 1112 PetscErrorCode ierr; 1113 Mat Cmpi; 1114 Mat_PtAPMPI *ptap; 1115 PetscFreeSpaceList free_space=NULL,current_space=NULL; 1116 Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c; 1117 Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 1118 Mat_SeqAIJ *p_loc,*p_oth; 1119 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pdti,*pdtj,*poti,*potj,*ptJ; 1120 PetscInt *adi=ad->i,*aj,*aoi=ao->i,nnz; 1121 PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row; 1122 PetscInt am=A->rmap->n,pN=P->cmap->N,pm=P->rmap->n,pn=P->cmap->n; 1123 PetscBT lnkbt; 1124 MPI_Comm comm; 1125 PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0; 1126 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 1127 PetscInt len,proc,*dnz,*onz,*owners; 1128 PetscInt nzi,*pti,*ptj; 1129 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1130 MPI_Request *swaits,*rwaits; 1131 MPI_Status *sstatus,rstatus; 1132 Mat_Merge_SeqsToMPI *merge; 1133 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,pon,nspacedouble=0,j,ap_rmax=0; 1134 PetscReal afill=1.0,afill_tmp; 1135 1136 PetscFunctionBegin; 1137 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1138 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1139 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1140 1141 /* create struct Mat_PtAPMPI and attached it to C later */ 1142 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1143 ierr = PetscNew(&merge);CHKERRQ(ierr); 1144 ptap->merge = merge; 1145 ptap->reuse = MAT_INITIAL_MATRIX; 1146 1147 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1148 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 1149 1150 /* get P_loc by taking all local rows of P */ 1151 
ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 1152 1153 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 1154 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 1155 pi_loc = p_loc->i; pj_loc = p_loc->j; 1156 pi_oth = p_oth->i; pj_oth = p_oth->j; 1157 1158 /* (1) compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth (api,apj) */ 1159 /*--------------------------------------------------------------------------*/ 1160 ierr = PetscMalloc1(am+1,&api);CHKERRQ(ierr); 1161 api[0] = 0; 1162 1163 /* create and initialize a linked list */ 1164 ierr = PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);CHKERRQ(ierr); 1165 1166 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) -OOM for ex56, np=8k on Intrepid! */ 1167 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);CHKERRQ(ierr); 1168 current_space = free_space; 1169 1170 for (i=0; i<am; i++) { 1171 /* diagonal portion of A */ 1172 nzi = adi[i+1] - adi[i]; 1173 aj = ad->j + adi[i]; 1174 for (j=0; j<nzi; j++) { 1175 row = aj[j]; 1176 pnz = pi_loc[row+1] - pi_loc[row]; 1177 Jptr = pj_loc + pi_loc[row]; 1178 /* add non-zero cols of P into the sorted linked list lnk */ 1179 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1180 } 1181 /* off-diagonal portion of A */ 1182 nzi = aoi[i+1] - aoi[i]; 1183 aj = ao->j + aoi[i]; 1184 for (j=0; j<nzi; j++) { 1185 row = aj[j]; 1186 pnz = pi_oth[row+1] - pi_oth[row]; 1187 Jptr = pj_oth + pi_oth[row]; 1188 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1189 } 1190 apnz = lnk[0]; 1191 api[i+1] = api[i] + apnz; 1192 if (ap_rmax < apnz) ap_rmax = apnz; 1193 1194 /* if free space is not available, double the total space in the list */ 1195 if (current_space->local_remaining<apnz) { 1196 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 1197 nspacedouble++; 1198 } 1199 1200 /* Copy data into free space, then initialize lnk */ 1201 ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1202 1203 current_space->array += apnz; 1204 current_space->local_used += apnz; 1205 current_space->local_remaining -= apnz; 1206 } 1207 1208 /* Allocate space for apj, initialize apj, and */ 1209 /* destroy list of free space and other temporary array(s) */ 1210 ierr = PetscMalloc1(api[am]+1,&apj);CHKERRQ(ierr); 1211 ierr = PetscFreeSpaceContiguous(&free_space,apj);CHKERRQ(ierr); 1212 afill_tmp = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1); 1213 if (afill_tmp > afill) afill = afill_tmp; 1214 1215 /* (2) determine symbolic Co=(p->B)^T*AP - send to others (coi,coj)*/ 1216 /*-----------------------------------------------------------------*/ 1217 ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr); 1218 1219 /* then, compute symbolic Co = (p->B)^T*AP */ 1220 pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors 1221 >= (num of nonzero rows of C_seq) - pn */ 1222 ierr = PetscMalloc1(pon+1,&coi);CHKERRQ(ierr); 1223 coi[0] = 0; 1224 1225 /* set initial free space to be fill*(nnz(p->B) + nnz(AP)) */ 1226 nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],api[am])); 1227 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1228 current_space = free_space; 1229 1230 for (i=0; i<pon; i++) { 1231 pnz = poti[i+1] - poti[i]; 1232 ptJ = potj + poti[i]; 1233 for (j=0; j<pnz; j++) { 1234 row = ptJ[j]; /* row of AP == col of Pot */ 1235 apnz = api[row+1] - api[row]; 
1236 Jptr = apj + api[row]; 1237 /* add non-zero cols of AP into the sorted linked list lnk */ 1238 ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1239 } 1240 nnz = lnk[0]; 1241 1242 /* If free space is not available, double the total space in the list */ 1243 if (current_space->local_remaining<nnz) { 1244 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 1245 nspacedouble++; 1246 } 1247 1248 /* Copy data into free space, and zero out denserows */ 1249 ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1250 1251 current_space->array += nnz; 1252 current_space->local_used += nnz; 1253 current_space->local_remaining -= nnz; 1254 1255 coi[i+1] = coi[i] + nnz; 1256 } 1257 1258 ierr = PetscMalloc1(coi[pon],&coj);CHKERRQ(ierr); 1259 ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr); 1260 afill_tmp = (PetscReal)coi[pon]/(poti[pon] + api[am]+1); 1261 if (afill_tmp > afill) afill = afill_tmp; 1262 ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&poti,&potj);CHKERRQ(ierr); 1263 1264 /* (3) send j-array (coj) of Co to other processors */ 1265 /*--------------------------------------------------*/ 1266 ierr = PetscCalloc1(size,&merge->len_s);CHKERRQ(ierr); 1267 len_s = merge->len_s; 1268 merge->nsend = 0; 1269 1270 1271 /* determine row ownership */ 1272 ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr); 1273 merge->rowmap->n = pn; 1274 merge->rowmap->bs = 1; 1275 1276 ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr); 1277 owners = merge->rowmap->range; 1278 1279 /* determine the number of messages to send, their lengths */ 1280 ierr = PetscMalloc2(size,&len_si,size,&sstatus);CHKERRQ(ierr); 1281 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 1282 ierr = PetscMalloc1(size+2,&owners_co);CHKERRQ(ierr); 1283 1284 proc = 0; 1285 for (i=0; i<pon; i++) { 1286 while (prmap[i] >= owners[proc+1]) proc++; 1287 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 1288 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 1289 } 1290 1291 len = 0; /* max length of buf_si[], see (4) */ 1292 owners_co[0] = 0; 1293 for (proc=0; proc<size; proc++) { 1294 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1295 if (len_s[proc]) { 1296 merge->nsend++; 1297 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 1298 len += len_si[proc]; 1299 } 1300 } 1301 1302 /* determine the number and length of messages to receive for coi and coj */ 1303 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 1304 ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 1305 1306 /* post the Irecv and Isend of coj */ 1307 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 1308 ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 1309 ierr = PetscMalloc1(merge->nsend+1,&swaits);CHKERRQ(ierr); 1310 for (proc=0, k=0; proc<size; proc++) { 1311 if (!len_s[proc]) continue; 1312 i = owners_co[proc]; 1313 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 1314 k++; 1315 } 1316 1317 /* receives and sends of coj are complete */ 1318 for (i=0; i<merge->nrecv; i++) { 1319 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1320 } 1321 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1322 if (merge->nsend) {ierr = 
MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1323 1324 /* (4) send and recv coi */ 1325 /*-----------------------*/ 1326 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 1327 ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 1328 ierr = PetscMalloc1(len+1,&buf_s);CHKERRQ(ierr); 1329 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1330 for (proc=0,k=0; proc<size; proc++) { 1331 if (!len_s[proc]) continue; 1332 /* form outgoing message for i-structure: 1333 buf_si[0]: nrows to be sent 1334 [1:nrows]: row index (global) 1335 [nrows+1:2*nrows+1]: i-structure index 1336 */ 1337 /*-------------------------------------------*/ 1338 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 1339 buf_si_i = buf_si + nrows+1; 1340 buf_si[0] = nrows; 1341 buf_si_i[0] = 0; 1342 nrows = 0; 1343 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 1344 nzi = coi[i+1] - coi[i]; 1345 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 1346 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 1347 nrows++; 1348 } 1349 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 1350 k++; 1351 buf_si += len_si[proc]; 1352 } 1353 i = merge->nrecv; 1354 while (i--) { 1355 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1356 } 1357 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1358 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1359 1360 ierr = PetscFree2(len_si,sstatus);CHKERRQ(ierr); 1361 ierr = PetscFree(len_ri);CHKERRQ(ierr); 1362 ierr = PetscFree(swaits);CHKERRQ(ierr); 1363 ierr = PetscFree(buf_s);CHKERRQ(ierr); 1364 1365 /* (5) compute the local portion of C (mpi mat) */ 1366 /*----------------------------------------------*/ 1367 ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr); 1368 1369 /* allocate pti array and free space for accumulating nonzero column info */ 1370 ierr = PetscMalloc1(pn+1,&pti);CHKERRQ(ierr); 1371 pti[0] = 0; 1372 1373 /* set initial free space to be fill*(nnz(P) + nnz(AP)) */ 1374 nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pi_loc[pm],api[am])); 1375 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1376 current_space = free_space; 1377 1378 ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr); 1379 for (k=0; k<merge->nrecv; k++) { 1380 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1381 nrows = *buf_ri_k[k]; 1382 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1383 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 1384 } 1385 ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr); 1386 1387 for (i=0; i<pn; i++) { 1388 /* add pdt[i,:]*AP into lnk */ 1389 pnz = pdti[i+1] - pdti[i]; 1390 ptJ = pdtj + pdti[i]; 1391 for (j=0; j<pnz; j++) { 1392 row = ptJ[j]; /* row of AP == col of Pt */ 1393 apnz = api[row+1] - api[row]; 1394 Jptr = apj + api[row]; 1395 /* add non-zero cols of AP into the sorted linked list lnk */ 1396 ierr = PetscLLCondensedAddSorted(apnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1397 } 1398 1399 /* add received col data into lnk */ 1400 for (k=0; k<merge->nrecv; k++) { /* k-th received message */ 1401 if (i == *nextrow[k]) { /* i-th row */ 1402 nzi = *(nextci[k]+1) - *nextci[k]; 1403 Jptr = buf_rj[k] + *nextci[k]; 1404 ierr = 
PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1405 nextrow[k]++; nextci[k]++; 1406 } 1407 } 1408 nnz = lnk[0]; 1409 1410 /* if free space is not available, make more free space */ 1411 if (current_space->local_remaining<nnz) { 1412 ierr = PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),¤t_space);CHKERRQ(ierr); 1413 nspacedouble++; 1414 } 1415 /* copy data into free space, then initialize lnk */ 1416 ierr = PetscLLCondensedClean(pN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1417 ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr); 1418 1419 current_space->array += nnz; 1420 current_space->local_used += nnz; 1421 current_space->local_remaining -= nnz; 1422 1423 pti[i+1] = pti[i] + nnz; 1424 } 1425 ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&pdti,&pdtj);CHKERRQ(ierr); 1426 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1427 1428 ierr = PetscMalloc1(pti[pn]+1,&ptj);CHKERRQ(ierr); 1429 ierr = PetscFreeSpaceContiguous(&free_space,ptj);CHKERRQ(ierr); 1430 afill_tmp = (PetscReal)pti[pn]/(pi_loc[pm] + api[am]+1); 1431 if (afill_tmp > afill) afill = afill_tmp; 1432 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 1433 1434 /* (6) create symbolic parallel matrix Cmpi */ 1435 /*------------------------------------------*/ 1436 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 1437 ierr = MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1438 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));CHKERRQ(ierr); 1439 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 1440 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1441 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 1442 1443 merge->bi = pti; /* Cseq->i */ 1444 merge->bj = ptj; /* Cseq->j */ 1445 merge->coi = coi; /* Co->i */ 1446 merge->coj = coj; /* Co->j */ 1447 merge->buf_ri = buf_ri; 1448 merge->buf_rj = buf_rj; 1449 merge->owners_co = owners_co; 1450 merge->destroy = Cmpi->ops->destroy; 1451 1452 /* attach the supporting struct to Cmpi for reuse */ 1453 c = (Mat_MPIAIJ*)Cmpi->data; 1454 c->ptap = ptap; 1455 ptap->api = api; 1456 ptap->apj = apj; 1457 ptap->duplicate = Cmpi->ops->duplicate; 1458 ptap->destroy = Cmpi->ops->destroy; 1459 1460 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1461 Cmpi->assembled = PETSC_FALSE; 1462 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1463 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP; 1464 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap; 1465 *C = Cmpi; 1466 1467 /* flag 'scalable' determines which implementations to be used: 1468 0: do dense axpy in MatPtAPNumeric() - fast, but requires storage of a nonscalable dense array apa; 1469 1: do sparse axpy in MatPtAPNumeric() - might slow, uses a sparse array apa */ 1470 /* set default scalable */ 1471 ptap->scalable = PETSC_FALSE; /* PETSC_TRUE; */ 1472 1473 ierr = PetscOptionsGetBool(((PetscObject)Cmpi)->options,((PetscObject)Cmpi)->prefix,"-matptap_scalable",&ptap->scalable,NULL);CHKERRQ(ierr); 1474 if (!ptap->scalable) { /* Do dense axpy */ 1475 ierr = PetscCalloc1(pN,&ptap->apa);CHKERRQ(ierr); 1476 } else { 1477 ierr = PetscCalloc1(ap_rmax+1,&ptap->apa);CHKERRQ(ierr); 1478 } 1479 1480 #if defined(PETSC_USE_INFO) 1481 if (pti[pn] != 0) { 1482 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr); 1483 ierr = PetscInfo1(Cmpi,"Use MatPtAP(A,P,MatReuse,%g,&C) for best 
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_ptap(Mat A,Mat P,Mat C)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_SeqAIJ          *p_loc,*p_oth;
  Mat_PtAPMPI         *ptap;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *adi=ad->i,*aoi=ao->i,*adj,*aoj,*apJ,nextp;
  PetscInt            *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pj;
  PetscInt            i,j,k,anz,pnz,apnz,nextap,row,*cj;
  MatScalar           *ada,*aoa,*apa,*pa,*ca,*pa_loc,*pa_oth,valtmp;
  PetscInt            am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *api,*apj,*coi,*coj;
  PetscInt            *poJ=po->j,*pdJ=pd->j,pcstart=P->cmap->rstart,pcend=P->cmap->rend;
  PetscBool           scalable;
#if defined(PTAP_PROFILE)
  PetscLogDouble t0,t1,t2,eP,t3,t4,et2_AP=0.0,ePtAP=0.0,t2_0,t2_1,t2_2;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t0);CHKERRQ(ierr);
#endif
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap = c->ptap;
  if (!ptap) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_INCOMP,"MatPtAP() has not been called to create matrix C yet, cannot use MAT_REUSE_MATRIX");
  merge    = ptap->merge;
  apa      = ptap->apa;
  scalable = ptap->scalable;

  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_INITIAL_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic(), skip calling MatGetBrowsOfAoCols() and MatMPIAIJGetLocalMat() */
    ptap->reuse = MAT_REUSE_MATRIX;
  } else { /* update numerical values of P_oth and P_loc */
    ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
    ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t1);CHKERRQ(ierr);
  eP   = t1-t0;
#endif
  /*
  printf("[%d] Ad: %d, %d; Ao: %d, %d; P_loc: %d, %d; P_oth %d, %d;\n",rank,
         a->A->rmap->N,a->A->cmap->N,a->B->rmap->N,a->B->cmap->N,
         ptap->P_loc->rmap->N,ptap->P_loc->cmap->N,
         ptap->P_oth->rmap->N,ptap->P_oth->cmap->N);
  */

  /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
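  /*
     Two value arrays are filled row by row below, matching the structures saved by the symbolic phase:
     coa accumulates the off-process contribution Co = Po^T*(A_loc*P) over the pattern coi/coj and is
     shipped to the owning processes in step 3, while ba accumulates the locally owned part of Cseq over
     the pattern bi/bj and is combined with the received values in step 4.
  */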
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscCalloc1(coi[pon]+1,&coa);CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr); /* ba: Cseq->a */

  api = ptap->api; apj = ptap->apj;

  if (!scalable) { /* Do dense axpy on apa (length pN, stores A[i,:]*P) - nonscalable, but faster (can take ~1/3 of the scalable time) */
    ierr = PetscInfo(C,"Using non-scalable dense axpy\n");CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
    ierr = PetscTime(&t1);CHKERRQ(ierr);
#endif
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* 2-a) form i-th sparse row of A_loc*P = Ad*P_loc + Ao*P_oth */
      /*------------------------------------------------------------*/
      apJ = apj + api[i];

      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row = adj[j];
        pnz = pi_loc[row+1] - pi_loc[row];
        pj  = pj_loc + pi_loc[row];
        pa  = pa_loc + pi_loc[row];

        /* perform dense axpy */
        valtmp = ada[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }

      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row = aoj[j];
        pnz = pi_oth[row+1] - pi_oth[row];
        pj  = pj_oth + pi_oth[row];
        pa  = pa_oth + pi_oth[row];

        /* perform dense axpy */
        valtmp = aoa[j];
        for (k=0; k<pnz; k++) {
          apa[pj[k]] += valtmp*pa[k];
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      apnz = api[i+1] - api[i];
      /* put the value into Co=(p->B)^T*AP (off-diagonal part, send to others) */
      pnz = po->i[i+1] - po->i[i];
      poJ = po->j + po->i[i];
      pA  = po->a + po->i[i];
      for (j=0; j<pnz; j++) {
        row = poJ[j];
        cnz = coi[row+1] - coi[row];
        cj  = coj + coi[row];
        ca  = coa + coi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
      /* put the value into Cd (diagonal part) */
      pnz = pd->i[i+1] - pd->i[i];
      pdJ = pd->j + pd->i[i];
      pA  = pd->a + pd->i[i];
      for (j=0; j<pnz; j++) {
        row = pdJ[j];
        cnz = bi[row+1] - bi[row];
        cj  = bj + bi[row];
        ca  = ba + bi[row];
        /* perform dense axpy */
        valtmp = pA[j];
        for (k=0; k<cnz; k++) {
          ca[k] += valtmp*apa[cj[k]];
        }
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }

      /* zero the current row of A*P */
      for (k=0; k<apnz; k++) apa[apJ[k]] = 0.0;
#if defined(PTAP_PROFILE)
      ierr   = PetscTime(&t2_2);CHKERRQ(ierr);
      ePtAP += t2_2 - t2_1;
#endif
    }
  } else { /* Do sparse axpy on apa (length ap_rmax, stores A[i,:]*P) - scalable, but slower */
    ierr = PetscInfo(C,"Using scalable sparse axpy\n");CHKERRQ(ierr);
    /*-----------------------------------------------------------------------------------------*/
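    /*
       With the scalable option, apa holds only the (at most ap_rmax) nonzeros of the current row of
       A*P instead of a dense row of length pN.  Each contribution is therefore merged by walking the
       sorted column list apJ of that row and the sorted column list pj of the P row being added in
       lockstep (indices k and nextp below), accumulating only where the columns coincide; the same
       merged walk is used when scattering apa into Co and Cd through poJ/pdJ.
    */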
    pA = pa_loc;
    for (i=0; i<am; i++) {
#if defined(PTAP_PROFILE)
      ierr = PetscTime(&t2_0);CHKERRQ(ierr);
#endif
      /* form i-th sparse row of A*P */
      apnz = api[i+1] - api[i];
      apJ  = apj + api[i];
      /* diagonal portion of A */
      anz = adi[i+1] - adi[i];
      adj = ad->j + adi[i];
      ada = ad->a + adi[i];
      for (j=0; j<anz; j++) {
        row    = adj[j];
        pnz    = pi_loc[row+1] - pi_loc[row];
        pj     = pj_loc + pi_loc[row];
        pa     = pa_loc + pi_loc[row];
        valtmp = ada[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
      /* off-diagonal portion of A */
      anz = aoi[i+1] - aoi[i];
      aoj = ao->j + aoi[i];
      aoa = ao->a + aoi[i];
      for (j=0; j<anz; j++) {
        row    = aoj[j];
        pnz    = pi_oth[row+1] - pi_oth[row];
        pj     = pj_oth + pi_oth[row];
        pa     = pa_oth + pi_oth[row];
        valtmp = aoa[j];
        nextp  = 0;
        for (k=0; nextp<pnz; k++) {
          if (apJ[k] == pj[nextp]) { /* col of AP == col of P */
            apa[k] += valtmp*pa[nextp++];
          }
        }
        ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
      }
#if defined(PTAP_PROFILE)
      ierr    = PetscTime(&t2_1);CHKERRQ(ierr);
      et2_AP += t2_1 - t2_0;
#endif

      /* 2-b) Compute Cseq = P_loc[i,:]^T*AP[i,:] using outer product */
      /*--------------------------------------------------------------*/
      pnz = pi_loc[i+1] - pi_loc[i];
      pJ  = pj_loc + pi_loc[i];
      for (j=0; j<pnz; j++) {
        nextap = 0;
        row    = pJ[j]; /* global index */
        if (row < pcstart || row >= pcend) { /* put the value into Co */
          row = *poJ;
          cj  = coj + coi[row];
          ca  = coa + coi[row]; poJ++;
        } else {                             /* put the value into Cd */
          row = *pdJ;
          cj  = bj + bi[row];
          ca  = ba + bi[row]; pdJ++;
        }
        valtmp = pA[j];
        for (k=0; nextap<apnz; k++) {
          if (cj[k]==apJ[nextap]) ca[k] += valtmp*apa[nextap++];
        }
        ierr = PetscLogFlops(2.0*apnz);CHKERRQ(ierr);
      }
      pA += pnz;
      /* zero the current row info for A*P */
      ierr = PetscMemzero(apa,apnz*sizeof(MatScalar));CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
      ierr   = PetscTime(&t2_2);CHKERRQ(ierr);
      ePtAP += t2_2 - t2_1;
#endif
    }
  }
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t2);CHKERRQ(ierr);
#endif

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++) {
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t3);CHKERRQ(ierr);
#endif

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
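  /*
     For every locally owned row of C, the contributions received from other processes (values in
     abuf_r with column indices in buf_rj) are folded into the local values ba by matching the sorted
     column lists bj_i and cj, and the completed row is then inserted into the MPIAIJ matrix with
     MatSetValues(...,INSERT_VALUES) before the final assembly.
  */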
  ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;             /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];       /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
#if defined(PTAP_PROFILE)
  ierr = PetscTime(&t4);CHKERRQ(ierr);
  if (rank==1) {
    ierr = PetscPrintf(MPI_COMM_SELF," [%d] PtAPNum %g/P + %g/PtAP( %g/A*P + %g/Pt*AP ) + %g/comm + %g/Cloc = %g\n\n",rank,eP,t2-t1,et2_AP,ePtAP,t3-t2,t4-t3,t4-t0);CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}