/*
   Defines projective product routines where A is a MPIAIJ matrix
            C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmapiv.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

/*
  Viewer attached to matrices produced by MatPtAP(): with an ASCII viewer in
  INFO or INFO_DETAIL format it reports which PtAP algorithm (algType 0-3)
  was used to build the product. Non-ASCII viewers are silently ignored.
*/
PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;
  Mat_APMPI         *ptap;

  PetscFunctionBegin;
  MatCheckProduct(A,1);
  ptap = (Mat_APMPI*)A->product->data;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
  if (iascii) {
    PetscCall(PetscViewerGetFormat(viewer,&format));
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      /* algType encodes which implementation filled in the product data */
      if (ptap->algType == 0) {
        PetscCall(PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 1) {
        PetscCall(PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 2) {
        PetscCall(PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n"));
      } else if (ptap->algType == 3) {
        PetscCall(PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n"));
      }
    }
  }
  PetscFunctionReturn(0);
}

/*
  Destructor for the Mat_APMPI support structure attached to a PtAP product.
  Frees every buffer/submatrix the symbolic phase may have created; fields
  unused by a given algorithm are NULL and the destroy/free calls are no-ops.
*/
PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
{
  Mat_APMPI           *ptap = (Mat_APMPI*)data;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  PetscCall(PetscFree2(ptap->startsj_s,ptap->startsj_r));
  PetscCall(PetscFree(ptap->bufa));
  PetscCall(MatDestroy(&ptap->P_loc));
  PetscCall(MatDestroy(&ptap->P_oth));
  PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
  PetscCall(MatDestroy(&ptap->Rd));
  PetscCall(MatDestroy(&ptap->Ro));
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = 
(Mat_SeqAIJ*)(ptap->AP_loc)->data;
    /* AP_loc was built with MatCreateSeqAIJWithArrays(); the CSR arrays are
       owned here, so free them explicitly before destroying the shell matrix */
    PetscCall(PetscFree(ap->i));
    PetscCall(PetscFree2(ap->j,ap->a));
    PetscCall(MatDestroy(&ptap->AP_loc));
  } else { /* used by alg_ptap */
    PetscCall(PetscFree(ptap->api));
    PetscCall(PetscFree(ptap->apj));
  }
  PetscCall(MatDestroy(&ptap->C_loc));
  PetscCall(MatDestroy(&ptap->C_oth));
  if (ptap->apa) PetscCall(PetscFree(ptap->apa));

  PetscCall(MatDestroy(&ptap->Pt));

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    PetscCall(PetscFree(merge->id_r));
    PetscCall(PetscFree(merge->len_s));
    PetscCall(PetscFree(merge->len_r));
    PetscCall(PetscFree(merge->bi));
    PetscCall(PetscFree(merge->bj));
    /* buf_ri/buf_rj are arrays of pointers whose storage was allocated as one
       contiguous chunk anchored at index 0 */
    PetscCall(PetscFree(merge->buf_ri[0]));
    PetscCall(PetscFree(merge->buf_ri));
    PetscCall(PetscFree(merge->buf_rj[0]));
    PetscCall(PetscFree(merge->buf_rj));
    PetscCall(PetscFree(merge->coi));
    PetscCall(PetscFree(merge->coj));
    PetscCall(PetscFree(merge->owners_co));
    PetscCall(PetscLayoutDestroy(&merge->rowmap));
    PetscCall(PetscFree(ptap->merge));
  }
  PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));

  PetscCall(PetscSFDestroy(&ptap->sf));
  PetscCall(PetscFree(ptap->c_othi));
  PetscCall(PetscFree(ptap->c_rmti));
  PetscCall(PetscFree(ptap));
  PetscFunctionReturn(0);
}

/*
  Numeric phase of the memory-scalable MatPtAP for MPIAIJ matrices:
  fills C = P^T*A*P using the structures prepared by
  MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable() (stored in C->product->data).
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
{
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C,3);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. 
Missing data");
  PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* 1) get R = Pd^T,Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
    PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

  api = ap->i;
  apj = ap->j;
  /* AP_loc column indices are stored in a compacted local numbering; map them
     to global indices (in place) for the row kernel, then map back below */
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj));
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    apnz = api[i+1] - api[i];
    apa = ap->a + api[i];
    PetscCall(PetscArrayzero(apa,apnz));
    AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
  }
  /* restore the compacted local numbering; every global index must map back */
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj));
  PetscCheck(api[AP_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,api[AP_loc->rmap->n],nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* Always use scalable version since we are in the MPI scalable version */
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc));
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth));

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to C */
  PetscCall(MatGetOwnershipRange(C,&rstart,&rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;
  /* translate C_loc's compacted column indices to global before insertion */
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j));

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts.  */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. 
*/ 177 if (C->assembled) { 178 C->was_assembled = PETSC_TRUE; 179 C->assembled = PETSC_FALSE; 180 } 181 if (C->was_assembled) { 182 for (i=0; i<cm; i++) { 183 ncols = c_seq->i[i+1] - c_seq->i[i]; 184 row = rstart + i; 185 PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES)); 186 cols += ncols; vals += ncols; 187 } 188 } else { 189 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a)); 190 } 191 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j)); 192 PetscCheck(c_seq->i[C_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout); 193 194 /* Co -> C, off-processor part */ 195 cm = C_oth->rmap->N; 196 c_seq = (Mat_SeqAIJ*)C_oth->data; 197 cols = c_seq->j; 198 vals = c_seq->a; 199 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j)); 200 for (i=0; i<cm; i++) { 201 ncols = c_seq->i[i+1] - c_seq->i[i]; 202 row = p->garray[i]; 203 PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES)); 204 cols += ncols; vals += ncols; 205 } 206 PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY)); 207 PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY)); 208 209 ptap->reuse = MAT_REUSE_MATRIX; 210 211 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j)); 212 PetscCheck(c_seq->i[C_oth->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_seq->i[C_loc->rmap->n],nout); 213 PetscFunctionReturn(0); 214 } 215 216 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi) 217 { 218 Mat_APMPI *ptap; 219 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data; 220 MPI_Comm comm; 221 PetscMPIInt size,rank; 222 Mat P_loc,P_oth; 223 PetscFreeSpaceList free_space=NULL,current_space=NULL; 224 PetscInt 
am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n; 225 PetscInt *lnk,i,k,pnz,row,nsend; 226 PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,nrecv; 227 PETSC_UNUSED PetscMPIInt icompleted=0; 228 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 229 const PetscInt *owners; 230 PetscInt len,proc,*dnz,*onz,nzi,nspacedouble; 231 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 232 MPI_Request *swaits,*rwaits; 233 MPI_Status *sstatus,rstatus; 234 PetscLayout rowmap; 235 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 236 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 237 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout; 238 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 239 PetscScalar *apv; 240 PetscTable ta; 241 MatType mtype; 242 const char *prefix; 243 #if defined(PETSC_USE_INFO) 244 PetscReal apfill; 245 #endif 246 247 PetscFunctionBegin; 248 MatCheckProduct(Cmpi,4); 249 PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty"); 250 PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 251 PetscCallMPI(MPI_Comm_size(comm,&size)); 252 PetscCallMPI(MPI_Comm_rank(comm,&rank)); 253 254 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 255 256 /* create symbolic parallel matrix Cmpi */ 257 PetscCall(MatGetType(A,&mtype)); 258 PetscCall(MatSetType(Cmpi,mtype)); 259 260 /* create struct Mat_APMPI and attached it to C later */ 261 PetscCall(PetscNew(&ptap)); 262 ptap->reuse = MAT_INITIAL_MATRIX; 263 ptap->algType = 0; 264 265 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 266 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth)); 267 /* get P_loc by taking all local rows of P */ 268 PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc)); 269 270 ptap->P_loc = P_loc; 
271 ptap->P_oth = P_oth; 272 273 /* (0) compute Rd = Pd^T, Ro = Po^T */ 274 /* --------------------------------- */ 275 PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd)); 276 PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro)); 277 278 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 279 /* ----------------------------------------------------------------- */ 280 p_loc = (Mat_SeqAIJ*)P_loc->data; 281 if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data; 282 283 /* create and initialize a linked list */ 284 PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */ 285 MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta); 286 MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta); 287 PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */ 288 289 PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk)); 290 291 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 292 if (ao) { 293 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space)); 294 } else { 295 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space)); 296 } 297 current_space = free_space; 298 nspacedouble = 0; 299 300 PetscCall(PetscMalloc1(am+1,&api)); 301 api[0] = 0; 302 for (i=0; i<am; i++) { 303 /* diagonal portion: Ad[i,:]*P */ 304 ai = ad->i; pi = p_loc->i; 305 nzi = ai[i+1] - ai[i]; 306 aj = ad->j + ai[i]; 307 for (j=0; j<nzi; j++) { 308 row = aj[j]; 309 pnz = pi[row+1] - pi[row]; 310 Jptr = p_loc->j + pi[row]; 311 /* add non-zero cols of P into the sorted linked list lnk */ 312 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk)); 313 } 314 /* off-diagonal portion: Ao[i,:]*P */ 315 if (ao) { 316 ai = ao->i; pi = p_oth->i; 317 nzi = ai[i+1] - ai[i]; 318 aj = ao->j + ai[i]; 319 for (j=0; j<nzi; j++) { 320 row = aj[j]; 321 pnz = pi[row+1] - pi[row]; 322 Jptr = p_oth->j + pi[row]; 323 
PetscCall(PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk)); 324 } 325 } 326 apnz = lnk[0]; 327 api[i+1] = api[i] + apnz; 328 329 /* if free space is not available, double the total space in the list */ 330 if (current_space->local_remaining<apnz) { 331 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space)); 332 nspacedouble++; 333 } 334 335 /* Copy data into free space, then initialize lnk */ 336 PetscCall(PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk)); 337 338 current_space->array += apnz; 339 current_space->local_used += apnz; 340 current_space->local_remaining -= apnz; 341 } 342 /* Allocate space for apj and apv, initialize apj, and */ 343 /* destroy list of free space and other temporary array(s) */ 344 PetscCall(PetscCalloc2(api[am],&apj,api[am],&apv)); 345 PetscCall(PetscFreeSpaceContiguous(&free_space,apj)); 346 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 347 348 /* Create AP_loc for reuse */ 349 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc)); 350 PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog)); 351 352 #if defined(PETSC_USE_INFO) 353 if (ao) { 354 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 355 } else { 356 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 357 } 358 ptap->AP_loc->info.mallocs = nspacedouble; 359 ptap->AP_loc->info.fill_ratio_given = fill; 360 ptap->AP_loc->info.fill_ratio_needed = apfill; 361 362 if (api[am]) { 363 PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill)); 364 PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill)); 365 } else { 366 PetscCall(PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n")); 367 } 368 #endif 369 370 /* (2-1) compute symbolic Co = Ro*AP_loc */ 371 /* 
-------------------------------------- */ 372 PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth)); 373 PetscCall(MatGetOptionsPrefix(A,&prefix)); 374 PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix)); 375 PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_")); 376 377 PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB)); 378 PetscCall(MatProductSetAlgorithm(ptap->C_oth,"sorted")); 379 PetscCall(MatProductSetFill(ptap->C_oth,fill)); 380 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 381 PetscCall(MatProductSymbolic(ptap->C_oth)); 382 383 /* (3) send coj of C_oth to other processors */ 384 /* ------------------------------------------ */ 385 /* determine row ownership */ 386 PetscCall(PetscLayoutCreate(comm,&rowmap)); 387 PetscCall(PetscLayoutSetLocalSize(rowmap, pn)); 388 PetscCall(PetscLayoutSetBlockSize(rowmap, 1)); 389 PetscCall(PetscLayoutSetUp(rowmap)); 390 PetscCall(PetscLayoutGetRanges(rowmap,&owners)); 391 392 /* determine the number of messages to send, their lengths */ 393 PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co)); 394 PetscCall(PetscArrayzero(len_s,size)); 395 PetscCall(PetscArrayzero(len_si,size)); 396 397 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 398 coi = c_oth->i; coj = c_oth->j; 399 con = ptap->C_oth->rmap->n; 400 proc = 0; 401 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj)); 402 for (i=0; i<con; i++) { 403 while (prmap[i] >= owners[proc+1]) proc++; 404 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 405 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 406 } 407 408 len = 0; /* max length of buf_si[], see (4) */ 409 owners_co[0] = 0; 410 nsend = 0; 411 for (proc=0; proc<size; proc++) { 412 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 413 if (len_s[proc]) { 414 nsend++; 415 len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 416 len += len_si[proc]; 417 } 418 } 419 420 /* 
determine the number and length of messages to receive for coi and coj */ 421 PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv)); 422 PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri)); 423 424 /* post the Irecv and Isend of coj */ 425 PetscCall(PetscCommGetNewTag(comm,&tagj)); 426 PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits)); 427 PetscCall(PetscMalloc1(nsend+1,&swaits)); 428 for (proc=0, k=0; proc<size; proc++) { 429 if (!len_s[proc]) continue; 430 i = owners_co[proc]; 431 PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k)); 432 k++; 433 } 434 435 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 436 /* ---------------------------------------- */ 437 PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc)); 438 PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB)); 439 PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default")); 440 PetscCall(MatProductSetFill(ptap->C_loc,fill)); 441 442 PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix)); 443 PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_")); 444 445 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 446 PetscCall(MatProductSymbolic(ptap->C_loc)); 447 448 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 449 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j)); 450 451 /* receives coj are complete */ 452 for (i=0; i<nrecv; i++) { 453 PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus)); 454 } 455 PetscCall(PetscFree(rwaits)); 456 if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus)); 457 458 /* add received column indices into ta to update Crmax */ 459 for (k=0; k<nrecv; k++) {/* k-th received message */ 460 Jptr = buf_rj[k]; 461 for (j=0; j<len_r[k]; j++) { 462 PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES)); 463 } 464 } 465 PetscCall(PetscTableGetCount(ta,&Crmax)); 466 PetscCall(PetscTableDestroy(&ta)); 467 468 /* (4) send and 
recv coi */ 469 /*-----------------------*/ 470 PetscCall(PetscCommGetNewTag(comm,&tagi)); 471 PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits)); 472 PetscCall(PetscMalloc1(len+1,&buf_s)); 473 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 474 for (proc=0,k=0; proc<size; proc++) { 475 if (!len_s[proc]) continue; 476 /* form outgoing message for i-structure: 477 buf_si[0]: nrows to be sent 478 [1:nrows]: row index (global) 479 [nrows+1:2*nrows+1]: i-structure index 480 */ 481 /*-------------------------------------------*/ 482 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 483 buf_si_i = buf_si + nrows+1; 484 buf_si[0] = nrows; 485 buf_si_i[0] = 0; 486 nrows = 0; 487 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 488 nzi = coi[i+1] - coi[i]; 489 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 490 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 491 nrows++; 492 } 493 PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k)); 494 k++; 495 buf_si += len_si[proc]; 496 } 497 for (i=0; i<nrecv; i++) { 498 PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus)); 499 } 500 PetscCall(PetscFree(rwaits)); 501 if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus)); 502 503 PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co)); 504 PetscCall(PetscFree(len_ri)); 505 PetscCall(PetscFree(swaits)); 506 PetscCall(PetscFree(buf_s)); 507 508 /* (5) compute the local portion of Cmpi */ 509 /* ------------------------------------------ */ 510 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 511 PetscCall(PetscFreeSpaceGet(Crmax,&free_space)); 512 current_space = free_space; 513 514 PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci)); 515 for (k=0; k<nrecv; k++) { 516 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 517 nrows = *buf_ri_k[k]; 518 nextrow[k] = buf_ri_k[k] + 1; 
/* next row number of k-th recved i-structure */ 519 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 520 } 521 522 MatPreallocateBegin(comm,pn,pn,dnz,onz); 523 PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk)); 524 for (i=0; i<pn; i++) { 525 /* add C_loc into Cmpi */ 526 nzi = c_loc->i[i+1] - c_loc->i[i]; 527 Jptr = c_loc->j + c_loc->i[i]; 528 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk)); 529 530 /* add received col data into lnk */ 531 for (k=0; k<nrecv; k++) { /* k-th received message */ 532 if (i == *nextrow[k]) { /* i-th row */ 533 nzi = *(nextci[k]+1) - *nextci[k]; 534 Jptr = buf_rj[k] + *nextci[k]; 535 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk)); 536 nextrow[k]++; nextci[k]++; 537 } 538 } 539 nzi = lnk[0]; 540 541 /* copy data into free space, then initialize lnk */ 542 PetscCall(PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk)); 543 PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz)); 544 } 545 PetscCall(PetscFree3(buf_ri_k,nextrow,nextci)); 546 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 547 PetscCall(PetscFreeSpaceDestroy(free_space)); 548 549 /* local sizes and preallocation */ 550 PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE)); 551 if (P->cmap->bs > 0) { 552 PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs)); 553 PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs)); 554 } 555 PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz)); 556 MatPreallocateEnd(dnz,onz); 557 558 /* members in merge */ 559 PetscCall(PetscFree(id_r)); 560 PetscCall(PetscFree(len_r)); 561 PetscCall(PetscFree(buf_ri[0])); 562 PetscCall(PetscFree(buf_ri)); 563 PetscCall(PetscFree(buf_rj[0])); 564 PetscCall(PetscFree(buf_rj)); 565 PetscCall(PetscLayoutDestroy(&rowmap)); 566 567 nout = 0; 568 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j)); 569 
PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_oth->i[ptap->C_oth->rmap->n],nout);
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j));
  PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT,c_loc->i[ptap->C_loc->rmap->n],nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(0);
}

/*
  Collect the column indices of row i of A*P for the all-at-once algorithm:
  indices that fall in P's local column range go into hash set dht (future
  diagonal-block entries), all others into oht (off-diagonal).  dof folds
  blocks of dof consecutive rows/cols onto one underlying P row; map takes an
  Ao column to the corresponding P_oth row.
*/
static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
{
  Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt   *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
  PetscInt   pcstart,pcend,column,offset;

  PetscFunctionBegin;
  /* local column ownership range of P, scaled by dof */
  pcstart = P->cmap->rstart;
  pcstart *= dof;
  pcend = P->cmap->rend;
  pcend *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai = ad->i;
  nzi = ai[i+1] - ai[i];
  aj = ad->j + ai[i];
  for (j=0; j<nzi; j++) {
    row = aj[j];
    offset = row%dof; /* position within the dof block */
    row /= dof;       /* underlying P row */
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    for (k=0; k<nzpi; k++) {
      /* Pd columns are local; shift by pcstart to get global column indices */
      PetscCall(PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart));
    }
  }
  /* off diag P */
  for (j=0; j<nzi; j++) {
    row = aj[j];
    offset = row%dof;
    row 
/= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj = po->j + po->i[row];
    for (k=0; k<nzpi; k++) {
      /* Po columns are compacted; p->garray maps them back to global */
      PetscCall(PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset));
    }
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj = ao->j + ai[i];
    for (j=0; j<nzi; j++) {
      row = aj[j];
      offset = a->garray[row]%dof; /* offset derived from the global A column */
      row = map[row];              /* translate Ao column to P_oth row */
      pnz = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col=0; col<pnz; col++) {
        column = p_othcols[col] * dof + offset;
        /* bin by ownership of the resulting global column */
        if (column>=pcstart && column<pcend) {
          PetscCall(PetscHSetIAdd(dht,column));
        } else {
          PetscCall(PetscHSetIAdd(oht,column));
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(0);
}

/*
  Numeric counterpart of the helper above: accumulate the values of row i of
  A*P into the hash map hmap (global column index -> value), using the same
  dof folding and Ao->P_oth row mapping.
*/
static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
{
  Mat_MPIAIJ  *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ  *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  PetscInt    *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
  PetscScalar ra,*aa,*pa;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai = ad->i;
  nzi = ai[i+1] - ai[i];
  aj = ad->j + ai[i];
  aa = ad->a + ai[i];
  for (j=0; j<nzi; j++) {
    ra = aa[j];
    row = aj[j];
    offset = row%dof;
    row /= dof;
    nzpi = pd->i[row+1] - pd->i[row];
    pj = pd->j + pd->i[row];
    pa = pd->a + pd->i[row];
    for (k=0; k<nzpi; k++) {
      PetscCall(PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]));
    }
    PetscCall(PetscLogFlops(2.0*nzpi));
  }
  /* same A row against Po (off-diagonal block of P) */
  for (j=0; j<nzi; j++) {
    ra = aa[j];
    row = aj[j];
    offset = row%dof;
    row /= dof;
    nzpi = po->i[row+1] - po->i[row];
    pj = 
po->j + po->i[row];
    pa = po->a + po->i[row];
    for (k=0; k<nzpi; k++) {
      /* p->garray maps Po's compacted column index to the global column */
      PetscCall(PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]));
    }
    PetscCall(PetscLogFlops(2.0*nzpi));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai = ao->i;
    pi = p_oth->i;
    nzi = ai[i+1] - ai[i];
    aj = ao->j + ai[i];
    aa = ao->a + ai[i];
    for (j=0; j<nzi; j++) {
      row = aj[j];
      offset = a->garray[row]%dof;
      row = map[row]; /* translate Ao column to P_oth row */
      ra = aa[j];
      pnz = pi[row+1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa = p_oth->a + pi[row];
      for (col=0; col<pnz; col++) {
        PetscCall(PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]));
      }
      PetscCall(PetscLogFlops(2.0*pnz));
    }
  } /* end if (ao) */

  PetscFunctionReturn(0);
}

PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);

/*
  Numeric phase of the all-at-once MatPtAP: forms C = P^T*A*P one A-row at a
  time via a hash map, sending contributions destined for remote C rows
  through the PetscSF set up by the symbolic phase.  dof allows the same P
  pattern to serve dof interleaved fields.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
{
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV    hmap;
  PetscInt       i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
  PetscScalar    *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt       offset,ii,pocol;
  const PetscInt *mappingindices;
  IS             map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. 
Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
  }
  /* IS mapping Ao columns to P_oth rows, stashed on P_oth by the symbolic phase */
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));

  /* pon: number of remote C rows this rank contributes to (scaled by dof) */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
  PetscCall(MatGetLocalSize(A,&am,NULL));
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
  PetscCall(PetscHMapIVCreate(&hmap));
  PetscCall(PetscHMapIVResize(hmap,cmaxr));
  PetscCall(ISGetIndices(map,&mappingindices));
  /* Pass 1: contributions Po^T * (A*P) destined for remote C rows */
  for (i=0; i<am && pon; i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i%dof;
    ii     = i/dof;
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;
    /* hmap accumulates row i of A*P */
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
    if (!voff) continue;

    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      /* merge the (unsorted-slot count c_rmtc[pocol]) row kept sorted by
         column via binary search + insertion */
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /*If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
        PetscCall(PetscLogFlops(voff));
      } /* End jj */
    } /* End j */
  } /* End i */

  PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));

  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;
  PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));

  /* start shipping the remote contributions while we work on the local part */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart = pcstart*dof;
  pcend   = pcend*dof;
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;

  cmaxr = 0;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  PetscCall(PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ));
  PetscCall(PetscHMapIVCreate(&hmap));
  PetscCall(PetscHMapIVResize(hmap,cmaxr));
  /* Pass 2: contributions Pd^T * (A*P) inserted directly into locally-owned C rows */
  for (i=0; i<am && pn; i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
    if (!voff) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<nzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
    }
  }
  PetscCall(ISRestoreIndices(map,&mappingindices));
  PetscCall(MatGetOwnershipRangeColumn(C,&ccstart,&ccend));
  PetscCall(PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ));
  PetscCall(PetscHMapIVDestroy(&hmap));
  /* Complete the split-phase SF reduction started earlier: rows of C owned by
     other ranks (computed locally into c_rmtj/c_rmta) arrive in c_othj/c_otha */
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj,c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj,c_otha));

  PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));

  /* Mark the product so the next numeric call reuses P_oth and the SF */
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

/* dof = 1 front end for the all-at-once numeric phase of C = P^T*A*P */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
{
  PetscFunctionBegin;

  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C));
  PetscFunctionReturn(0);
}

/*
   Numeric phase of C = P^T*A*P, "merged" all-at-once variant.

   Unlike the plain all-at-once variant, a single sweep over the rows of A
   forms BOTH the remote contributions (rows of C owned by other ranks, keyed
   by the off-diagonal part Po of P) and the local contributions (keyed by the
   diagonal part Pd), so each row of AP is computed only once.

   dof > 1 treats P as acting on dof interleaved sub-problems; rows/columns
   are expanded by dof with `offset` selecting the component.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
{
  Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
  Mat_APMPI         *ptap;
  PetscHMapIV       hmap;                       /* column -> value map for one row of AP */
  PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
  PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
  PetscInt          offset,ii,pocol;
  const PetscInt    *mappingindices;
  IS                map;

  PetscFunctionBegin;
  MatCheckProduct(C,4);
  ptap = (Mat_APMPI*)C->product->data;
  PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth));
  }
  /* Mapping from A's off-diagonal columns to rows of P_oth, stashed by the symbolic phase */
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta));
  PetscCall(MatGetLocalSize(A,&am,NULL));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  /* cmaxr bounds the widest row we will produce (remote rows via c_rmti,
     local rows via the preallocated diag+offdiag pattern of C) */
  cmaxr = 0;
  for (i=0; i<pon; i++) {
    cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
  }
  cd = (Mat_SeqAIJ*)(c->A)->data;
  co = (Mat_SeqAIJ*)(c->B)->data;
  for (i=0; i<pn; i++) {
    cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
  }
  PetscCall(PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc));
  PetscCall(PetscHMapIVCreate(&hmap));
  PetscCall(PetscHMapIVResize(hmap,cmaxr));
  PetscCall(ISGetIndices(map,&mappingindices));
  /* One sweep over the (dof-expanded) rows of A; "&& (pon || pn)" short-circuits
     the whole loop when this rank owns neither remote nor local targets */
  for (i=0; i<am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i%dof;
    ii     = i/dof;
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    /* Row i of AP accumulated into hmap (column -> value) */
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues));
    if (!voff) continue;

    /* Form remote C(ii, :): scale row of AP by Po(ii,:) entries and merge into
       the sorted send buffer for the owning rank (insertion into sorted run) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j=0; j<nzi; j++) {
      pocol   = poj[j]*dof+offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          PetscCall(PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc));
          if (loc>=0){ /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc+1);
            /* Move data backward */
            for (kk=c_rmtc[pocol]; kk>loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk-1];
              c_rmtaa[kk] = c_rmtaa[kk-1];
            }/* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :): scale row of AP by Pd(ii,:) entries and add directly */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj=0; jj<voff; jj++) {
        apvaluestmp[jj] = apvalues[jj]*pda[j];
      }/* End jj */
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES));
    }/* End j */
  } /* End i */

  PetscCall(ISRestoreIndices(map,&mappingindices));
  PetscCall(PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha));

  /* Ship remote contributions to their owners (no local work overlaps here,
     so Begin/End are back-to-back, unlike the non-merged variant) */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj,c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj,c_otha));

  PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

/* dof = 1 front end for the merged all-at-once numeric phase */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
{
  PetscFunctionBegin;

  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C));
  PetscFunctionReturn(0);
}

/* TODO: move algorithm selection to MatProductSetFromOptions */
/*
   Symbolic phase of C = P^T*A*P, all-at-once variant (algType 2): determines
   the sparsity pattern and preallocation of C and builds the PetscSF used by
   the numeric phase to ship remotely-owned rows.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm       comm;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0,offset,ii;
  PetscMPIInt    owner;
  const PetscInt *mappingindices;
  PetscBool      flg;
  const char     *algTypes[2] = {"overlapping","merged"};
  IS             map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2;   /* reported by MatView_MPIAIJ_PtAP as "allatonce" */

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));
  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon,&c_rmtc));
  /* One hash set per remotely-owned target row, to union column indices */
  PetscCall(PetscMalloc1(pon,&hta));
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
  }
  PetscCall(MatGetLocalSize(A,&am,NULL));
  /* NOTE(review): arstart/arend are not used below — retained for symmetry with other variants */
  PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));

  PetscCall(ISGetIndices(map,&mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */
  for (i=0; i<am && pon; i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii+1] - po->i[ii];
    if (!nzi) continue;

    /* Diag and offdiag columns of AP both go into the same set ht here */
    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
    }
  }

  /* Turn per-row set sizes into CSR-style offsets c_rmti and counts c_rmtc */
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));

  for (i=0; i<pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* One SF leaf per remotely-owned row: destination rank/index from P's column layout */
  PetscCall(PetscMalloc1(pon,&iremote));
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
    iremote[i].index = lidx*dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  PetscCall(PetscMalloc1(rootspacesize,&rootspace));
  PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscCalloc1(pn+1,&ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each leaf where its data lands in the root's receive buffer */
  PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Second SF: one leaf per nonzero to be sent, used by the numeric phase too */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));

  PetscCall(PetscSFCreate(comm,&ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));

  PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  PetscCall(PetscMalloc2(pn,&hta,pn,&hto));
  for (i=0; i<pn; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  /* This local sweep deliberately overlaps the outstanding SF reduction above */
  for (i=0; i<am && pn; i++) {
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i%dof;
    ii     = i/dof;
    nzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    PetscCall(PetscHSetIGetSize(oht,&htosize));
    if (!(htsize+htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
    }
  }

  PetscCall(ISRestoreIndices(map,&mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));

  /* Merge received column indices into the per-row diag/offdiag sets,
     then read off the preallocation counts */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        PetscCall(PetscHSetIAdd(hta[i],col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i],col));
      }
    }
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hta[i]));
    PetscCall(PetscHSetIGetSize(hto[i],&htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(hta,hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  PetscCall(MatSetUp(Cmpi));
  PetscCall(PetscFree2(dnz,onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(0);
}

/* dof = 1 front end for the all-at-once symbolic phase */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C));
  PetscFunctionReturn(0);
}

PetscErrorCode
MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
{
  /* Symbolic phase of C = P^T*A*P, merged all-at-once variant (algType 3):
     a single sweep over the rows of A sizes both the remote sends (hta, one
     set per off-diagonal target row) and the local diag/offdiag pattern
     (htd/hto, one pair per locally-owned row). */
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p=(Mat_MPIAIJ*)P->data;
  MPI_Comm       comm;
  Mat_SeqAIJ     *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
  MatType        mtype;
  PetscSF        sf;
  PetscSFNode    *iremote;
  PetscInt       rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI     ht,oht,*hta,*hto,*htd;
  PetscInt       pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
  PetscInt       lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
  PetscInt       nalg=2,alg=0,offset,ii;
  PetscMPIInt    owner;
  PetscBool      flg;
  const char     *algTypes[2] = {"merged","overlapping"};
  const PetscInt *mappingindices;
  IS             map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi,5);
  PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P,NULL,&pn));
  pn *= dof;
  PetscCall(MatGetType(A,&mtype));
  PetscCall(MatSetType(Cmpi,mtype));
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3;   /* reported by MatView_MPIAIJ_PtAP as "merged allatonce" */

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map));

  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B,NULL,&pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon+1,&ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon,&c_rmtc));
  PetscCall(PetscMalloc1(pon,&hta));
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
  }
  PetscCall(MatGetLocalSize(A,&am,NULL));
  /* NOTE(review): arstart/arend are not used below — retained for symmetry with other variants */
  PetscCall(MatGetOwnershipRange(A,&arstart,&arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(PetscMalloc2(pn,&htd,pn,&hto));
  for (i=0; i<pn; i++) {
    PetscCall(PetscHSetICreate(&htd[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }

  PetscCall(ISGetIndices(map,&mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */
  for (i=0; i<am && (pon || pn); i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i%dof;
    ii     = i/dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi  = po->i[ii+1] - po->i[ii];
    dnzi = pd->i[ii+1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht));
    PetscCall(PetscHSetIGetSize(ht,&htsize));
    PetscCall(PetscHSetIGetSize(oht,&htosize));
    /* If AP is empty, just continue */
    if (!(htsize+htosize)) continue;

    /* Form remote C(ii, :) — remote targets take diag and offdiag columns together */
    poj = po->j + po->i[ii];
    for (j=0; j<nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hta[poj[j]*dof+offset],oht));
    }

    /* Form local C(ii, :) — locally-owned targets keep diag/offdiag split for preallocation */
    pdj = pd->j + pd->i[ii];
    for (j=0; j<dnzi; j++) {
      PetscCall(PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht));
    }
  }

  PetscCall(ISRestoreIndices(map,&mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Turn per-row set sizes into CSR-style offsets c_rmti and counts c_rmtc */
  for (i=0; i<pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i],&htsize));
    ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i] = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon],&c_rmtj));

  for (i=0; i<pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* One SF leaf per remotely-owned row: destination rank/index from P's column layout */
  PetscCall(PetscMalloc1(pon,&iremote));
  for (i=0; i<pon; i++) {
    owner = 0; lidx = 0;
    offset = i%dof;
    ii     = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx));
    iremote[i].index = lidx*dof+offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf,PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf,&rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf,&rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) {
    rootspacesize += rootdegrees[i];
  }
  PetscCall(PetscMalloc1(rootspacesize,&rootspace));
  PetscCall(PetscMalloc1(rootspacesize+1,&rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscMalloc1(pn+1,&ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j<rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each leaf where its data lands in the root's receive buffer */
  PetscCall(PetscMalloc1(pon,&c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Second SF: one leaf per nonzero to be sent, reused by the numeric phase */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon],&iremote));
  nleaves = 0;
  for (i = 0; i<pon; i++) {
    owner = 0;
    ii = i/dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL));
    sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
    for (j=0; j<sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn],&c_othj));

  PetscCall(PetscSFCreate(comm,&ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));
  PetscCall(PetscMalloc2(pn,&dnz,pn,&onz));
  PetscCall(MatGetOwnershipRangeColumn(P,&pcstart,&pcend));
  pcstart *= dof;
  pcend   *= dof;
  /* Merge received column indices into the per-row diag/offdiag sets,
     then read off the preallocation counts */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col>=pcstart && col<pcend) {
        PetscCall(PetscHSetIAdd(htd[i],col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i],col));
      }
    }
    PetscCall(PetscHSetIGetSize(htd[i],&htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&htd[i]));
    PetscCall(PetscHSetIGetSize(hto[i],&htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(htd,hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz));
  PetscCall(PetscFree2(dnz,onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg));
  PetscOptionsEnd();
  /* Note: algTypes order here is {"merged","overlapping"}, so case 0 is merged */
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(0);
}

/* dof = 1 front end for the merged all-at-once symbolic phase */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C));
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of the nonscalable (algType 1) PtAP: builds AP_loc = A_loc*P
   explicitly and sets up the merge communication.  (Definition continues
   beyond this chunk.)
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
{
  Mat_APMPI                *ptap;
  Mat_MPIAIJ               *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size,rank;
  PetscFreeSpaceList       free_space=NULL,current_space=NULL;
  PetscInt                 am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
  PetscInt                 *lnk,i,k,pnz,row,nsend;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
  PETSC_UNUSED PetscMPIInt icompleted=0;
  PetscInt                 **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt                 len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1566 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1567 MPI_Request *swaits,*rwaits; 1568 MPI_Status *sstatus,rstatus; 1569 PetscLayout rowmap; 1570 PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 1571 PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */ 1572 PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi; 1573 Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth; 1574 PetscScalar *apv; 1575 PetscTable ta; 1576 MatType mtype; 1577 const char *prefix; 1578 #if defined(PETSC_USE_INFO) 1579 PetscReal apfill; 1580 #endif 1581 1582 PetscFunctionBegin; 1583 MatCheckProduct(Cmpi,4); 1584 PetscCheck(!Cmpi->product->data,PetscObjectComm((PetscObject)Cmpi),PETSC_ERR_PLIB,"Product data not empty"); 1585 PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 1586 PetscCallMPI(MPI_Comm_size(comm,&size)); 1587 PetscCallMPI(MPI_Comm_rank(comm,&rank)); 1588 1589 if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data; 1590 1591 /* create symbolic parallel matrix Cmpi */ 1592 PetscCall(MatGetType(A,&mtype)); 1593 PetscCall(MatSetType(Cmpi,mtype)); 1594 1595 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 1596 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 1597 1598 /* create struct Mat_APMPI and attached it to C later */ 1599 PetscCall(PetscNew(&ptap)); 1600 ptap->reuse = MAT_INITIAL_MATRIX; 1601 ptap->algType = 1; 1602 1603 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1604 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth)); 1605 /* get P_loc by taking all local rows of P */ 1606 PetscCall(MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc)); 1607 1608 /* (0) compute Rd = Pd^T, Ro = Po^T */ 1609 /* --------------------------------- */ 1610 PetscCall(MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd)); 1611 
PetscCall(MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro)); 1612 1613 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 1614 /* ----------------------------------------------------------------- */ 1615 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 1616 if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 1617 1618 /* create and initialize a linked list */ 1619 PetscCall(PetscTableCreate(pn,pN,&ta)); /* for compute AP_loc and Cmpi */ 1620 MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta); 1621 MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta); 1622 PetscCall(PetscTableGetCount(ta,&Crmax)); /* Crmax = nnz(sum of Prows) */ 1623 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 1624 1625 PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt)); 1626 1627 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 1628 if (ao) { 1629 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space)); 1630 } else { 1631 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space)); 1632 } 1633 current_space = free_space; 1634 nspacedouble = 0; 1635 1636 PetscCall(PetscMalloc1(am+1,&api)); 1637 api[0] = 0; 1638 for (i=0; i<am; i++) { 1639 /* diagonal portion: Ad[i,:]*P */ 1640 ai = ad->i; pi = p_loc->i; 1641 nzi = ai[i+1] - ai[i]; 1642 aj = ad->j + ai[i]; 1643 for (j=0; j<nzi; j++) { 1644 row = aj[j]; 1645 pnz = pi[row+1] - pi[row]; 1646 Jptr = p_loc->j + pi[row]; 1647 /* add non-zero cols of P into the sorted linked list lnk */ 1648 PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt)); 1649 } 1650 /* off-diagonal portion: Ao[i,:]*P */ 1651 if (ao) { 1652 ai = ao->i; pi = p_oth->i; 1653 nzi = ai[i+1] - ai[i]; 1654 aj = ao->j + ai[i]; 1655 for (j=0; j<nzi; j++) { 1656 row = aj[j]; 1657 pnz = pi[row+1] - pi[row]; 1658 Jptr = p_oth->j + pi[row]; 1659 
PetscCall(PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt)); 1660 } 1661 } 1662 apnz = lnk[0]; 1663 api[i+1] = api[i] + apnz; 1664 if (ap_rmax < apnz) ap_rmax = apnz; 1665 1666 /* if free space is not available, double the total space in the list */ 1667 if (current_space->local_remaining<apnz) { 1668 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),¤t_space)); 1669 nspacedouble++; 1670 } 1671 1672 /* Copy data into free space, then initialize lnk */ 1673 PetscCall(PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt)); 1674 1675 current_space->array += apnz; 1676 current_space->local_used += apnz; 1677 current_space->local_remaining -= apnz; 1678 } 1679 /* Allocate space for apj and apv, initialize apj, and */ 1680 /* destroy list of free space and other temporary array(s) */ 1681 PetscCall(PetscMalloc2(api[am],&apj,api[am],&apv)); 1682 PetscCall(PetscFreeSpaceContiguous(&free_space,apj)); 1683 PetscCall(PetscLLDestroy(lnk,lnkbt)); 1684 1685 /* Create AP_loc for reuse */ 1686 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc)); 1687 PetscCall(MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name)); 1688 #if defined(PETSC_USE_INFO) 1689 if (ao) { 1690 apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1); 1691 } else { 1692 apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1); 1693 } 1694 ptap->AP_loc->info.mallocs = nspacedouble; 1695 ptap->AP_loc->info.fill_ratio_given = fill; 1696 ptap->AP_loc->info.fill_ratio_needed = apfill; 1697 1698 if (api[am]) { 1699 PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill)); 1700 PetscCall(PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill)); 1701 } else { 1702 PetscCall(PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n")); 1703 } 1704 #endif 1705 1706 /* 
(2-1) compute symbolic Co = Ro*AP_loc */ 1707 /* ------------------------------------ */ 1708 PetscCall(MatGetOptionsPrefix(A,&prefix)); 1709 PetscCall(MatSetOptionsPrefix(ptap->Ro,prefix)); 1710 PetscCall(MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_")); 1711 PetscCall(MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth)); 1712 PetscCall(MatGetOptionsPrefix(Cmpi,&prefix)); 1713 PetscCall(MatSetOptionsPrefix(ptap->C_oth,prefix)); 1714 PetscCall(MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_")); 1715 PetscCall(MatProductSetType(ptap->C_oth,MATPRODUCT_AB)); 1716 PetscCall(MatProductSetAlgorithm(ptap->C_oth,"default")); 1717 PetscCall(MatProductSetFill(ptap->C_oth,fill)); 1718 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 1719 PetscCall(MatProductSymbolic(ptap->C_oth)); 1720 1721 /* (3) send coj of C_oth to other processors */ 1722 /* ------------------------------------------ */ 1723 /* determine row ownership */ 1724 PetscCall(PetscLayoutCreate(comm,&rowmap)); 1725 rowmap->n = pn; 1726 rowmap->bs = 1; 1727 PetscCall(PetscLayoutSetUp(rowmap)); 1728 owners = rowmap->range; 1729 1730 /* determine the number of messages to send, their lengths */ 1731 PetscCall(PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co)); 1732 PetscCall(PetscArrayzero(len_s,size)); 1733 PetscCall(PetscArrayzero(len_si,size)); 1734 1735 c_oth = (Mat_SeqAIJ*)ptap->C_oth->data; 1736 coi = c_oth->i; coj = c_oth->j; 1737 con = ptap->C_oth->rmap->n; 1738 proc = 0; 1739 for (i=0; i<con; i++) { 1740 while (prmap[i] >= owners[proc+1]) proc++; 1741 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 1742 len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 1743 } 1744 1745 len = 0; /* max length of buf_si[], see (4) */ 1746 owners_co[0] = 0; 1747 nsend = 0; 1748 for (proc=0; proc<size; proc++) { 1749 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1750 if (len_s[proc]) { 1751 nsend++; 1752 len_si[proc] = 2*(len_si[proc] + 
1); /* length of buf_si to be sent to [proc] */ 1753 len += len_si[proc]; 1754 } 1755 } 1756 1757 /* determine the number and length of messages to receive for coi and coj */ 1758 PetscCall(PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv)); 1759 PetscCall(PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri)); 1760 1761 /* post the Irecv and Isend of coj */ 1762 PetscCall(PetscCommGetNewTag(comm,&tagj)); 1763 PetscCall(PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits)); 1764 PetscCall(PetscMalloc1(nsend+1,&swaits)); 1765 for (proc=0, k=0; proc<size; proc++) { 1766 if (!len_s[proc]) continue; 1767 i = owners_co[proc]; 1768 PetscCallMPI(MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k)); 1769 k++; 1770 } 1771 1772 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 1773 /* ---------------------------------------- */ 1774 PetscCall(MatSetOptionsPrefix(ptap->Rd,prefix)); 1775 PetscCall(MatAppendOptionsPrefix(ptap->Rd,"inner_diag_")); 1776 PetscCall(MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc)); 1777 PetscCall(MatGetOptionsPrefix(Cmpi,&prefix)); 1778 PetscCall(MatSetOptionsPrefix(ptap->C_loc,prefix)); 1779 PetscCall(MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_")); 1780 PetscCall(MatProductSetType(ptap->C_loc,MATPRODUCT_AB)); 1781 PetscCall(MatProductSetAlgorithm(ptap->C_loc,"default")); 1782 PetscCall(MatProductSetFill(ptap->C_loc,fill)); 1783 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 1784 PetscCall(MatProductSymbolic(ptap->C_loc)); 1785 1786 c_loc = (Mat_SeqAIJ*)ptap->C_loc->data; 1787 1788 /* receives coj are complete */ 1789 for (i=0; i<nrecv; i++) { 1790 PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus)); 1791 } 1792 PetscCall(PetscFree(rwaits)); 1793 if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus)); 1794 1795 /* add received column indices into ta to update Crmax */ 1796 for (k=0; k<nrecv; k++) {/* k-th received message */ 1797 Jptr = buf_rj[k]; 1798 for (j=0; 
j<len_r[k]; j++) { 1799 PetscCall(PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES)); 1800 } 1801 } 1802 PetscCall(PetscTableGetCount(ta,&Crmax)); 1803 PetscCall(PetscTableDestroy(&ta)); 1804 1805 /* (4) send and recv coi */ 1806 /*-----------------------*/ 1807 PetscCall(PetscCommGetNewTag(comm,&tagi)); 1808 PetscCall(PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits)); 1809 PetscCall(PetscMalloc1(len+1,&buf_s)); 1810 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1811 for (proc=0,k=0; proc<size; proc++) { 1812 if (!len_s[proc]) continue; 1813 /* form outgoing message for i-structure: 1814 buf_si[0]: nrows to be sent 1815 [1:nrows]: row index (global) 1816 [nrows+1:2*nrows+1]: i-structure index 1817 */ 1818 /*-------------------------------------------*/ 1819 nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */ 1820 buf_si_i = buf_si + nrows+1; 1821 buf_si[0] = nrows; 1822 buf_si_i[0] = 0; 1823 nrows = 0; 1824 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 1825 nzi = coi[i+1] - coi[i]; 1826 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 1827 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 1828 nrows++; 1829 } 1830 PetscCallMPI(MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k)); 1831 k++; 1832 buf_si += len_si[proc]; 1833 } 1834 for (i=0; i<nrecv; i++) { 1835 PetscCallMPI(MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus)); 1836 } 1837 PetscCall(PetscFree(rwaits)); 1838 if (nsend) PetscCallMPI(MPI_Waitall(nsend,swaits,sstatus)); 1839 1840 PetscCall(PetscFree4(len_s,len_si,sstatus,owners_co)); 1841 PetscCall(PetscFree(len_ri)); 1842 PetscCall(PetscFree(swaits)); 1843 PetscCall(PetscFree(buf_s)); 1844 1845 /* (5) compute the local portion of Cmpi */ 1846 /* ------------------------------------------ */ 1847 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 1848 PetscCall(PetscFreeSpaceGet(Crmax,&free_space)); 1849 current_space = 
free_space; 1850 1851 PetscCall(PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci)); 1852 for (k=0; k<nrecv; k++) { 1853 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1854 nrows = *buf_ri_k[k]; 1855 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1856 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 1857 } 1858 1859 MatPreallocateBegin(comm,pn,pn,dnz,onz); 1860 PetscCall(PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt)); 1861 for (i=0; i<pn; i++) { 1862 /* add C_loc into Cmpi */ 1863 nzi = c_loc->i[i+1] - c_loc->i[i]; 1864 Jptr = c_loc->j + c_loc->i[i]; 1865 PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt)); 1866 1867 /* add received col data into lnk */ 1868 for (k=0; k<nrecv; k++) { /* k-th received message */ 1869 if (i == *nextrow[k]) { /* i-th row */ 1870 nzi = *(nextci[k]+1) - *nextci[k]; 1871 Jptr = buf_rj[k] + *nextci[k]; 1872 PetscCall(PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt)); 1873 nextrow[k]++; nextci[k]++; 1874 } 1875 } 1876 nzi = lnk[0]; 1877 1878 /* copy data into free space, then initialize lnk */ 1879 PetscCall(PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt)); 1880 PetscCall(MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz)); 1881 } 1882 PetscCall(PetscFree3(buf_ri_k,nextrow,nextci)); 1883 PetscCall(PetscLLDestroy(lnk,lnkbt)); 1884 PetscCall(PetscFreeSpaceDestroy(free_space)); 1885 1886 /* local sizes and preallocation */ 1887 PetscCall(MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE)); 1888 if (P->cmap->bs > 0) { 1889 PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs)); 1890 PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs)); 1891 } 1892 PetscCall(MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz)); 1893 MatPreallocateEnd(dnz,onz); 1894 1895 /* members in merge */ 1896 PetscCall(PetscFree(id_r)); 1897 PetscCall(PetscFree(len_r)); 1898 PetscCall(PetscFree(buf_ri[0])); 1899 
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* dense work array of length pN used by the numeric phase (AProw_nonscalable accumulator) */
  PetscCall(PetscCalloc1(pN,&ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - numeric phase of C = P^T*A*P for MPIAIJ matrices
   (nonscalable algorithm).

   Input Parameters:
+  A - MPIAIJ matrix
.  P - MPIAIJ projection matrix
-  C - result matrix produced by the matching symbolic phase; C->product->data
       must hold the Mat_APMPI support structure stashed there by the symbolic routine

   Notes:
   Only numerical values are recomputed here; all sparsity patterns and the
   intermediate products (Rd, Ro, P_loc, P_oth, AP_loc, C_loc, C_oth) come from
   the symbolic phase.  On the first call ptap->reuse is not MAT_REUSE_MATRIX, so
   the transposes and local-P gathers computed symbolically are used as-is; at the
   end of the routine ptap->reuse is set to MAT_REUSE_MATRIX so subsequent calls
   refresh them in place.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
  Mat_APMPI         *ptap;
  Mat               AP_loc,C_loc,C_oth;
  PetscInt          i,rstart,rend,cm,ncols,row;
  PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C,3);
  ptap = (Mat_APMPI*)C->product->data;
  /* the symbolic data must exist and must not have been freed by MatProductClear() */
  PetscCheck(ptap,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T,Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* re-use the transpose patterns built by the symbolic phase; only values change */
    PetscCall(MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd));
    PetscCall(MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ*)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  if (ptap->P_oth) {
    p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  }
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i=0; i<am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
    apnz = api[i+1] - api[i];
    /* harvest row i from the dense accumulator apa into AP_loc's CSR values,
       resetting each entry to zero so apa is clean for the next row */
    for (j=0; j<apnz; j++) {
      col = apj[j+api[i]];
      ap->a[j+ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to to C */
  PetscCall(MatGetOwnershipRange(C,&rstart,&rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts.  */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    /* slow path: per-row insertion through the general MPIAIJ SetValues */
    for (i=0; i<cm; i++) {
      ncols = c_seq->i[i+1] - c_seq->i[i];
      row = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES));
      cols += ncols; vals += ncols;
    }
  } else {
    /* fast path: bulk copy of the whole local CSR structure */
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ*)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i=0; i<cm; i++) {
    ncols = c_seq->i[i+1] - c_seq->i[i];
    row   = p->garray[i];  /* map local off-diagonal row to its global row number */
    PetscCall(MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES));
    cols += ncols; vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));

  /* mark the stashed intermediates as initialized so later calls take the reuse paths above */
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

/*
   MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ - dispatch the symbolic phase of C = P^T*A*P
   for MPIAIJ operands to the implementation selected by C->product->alg.

   Recognized algorithms: "scalable", "nonscalable", "allatonce",
   "allatonce_merged", "backend", and (when PETSc is configured with hypre)
   "hypre"; any other name raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
{
  Mat_Product         *product = C->product;
  Mat                 A=product->A,P=product->B;
  MatProductAlgorithm alg=product->alg;
  PetscReal           fill=product->fill;
  PetscBool           flg;

  PetscFunctionBegin;
  /* scalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg,"scalable",&flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    goto next;
  }

  /* nonscalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg,"nonscalable",&flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C));
    goto next;
  }

  /* allatonce */
  PetscCall(PetscStrcmp(alg,"allatonce",&flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C));
    goto next;
  }

  /* allatonce_merged */
  PetscCall(PetscStrcmp(alg,"allatonce_merged",&flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C));
    goto next;
  }

  /* backend general code */
  /* NOTE(review): this branch returns without setting C->ops->productnumeric here;
     presumably MatProductSymbolic_MPIAIJBACKEND installs its own numeric routine */
  PetscCall(PetscStrcmp(alg,"backend",&flg));
  if (flg) {
    PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
    PetscFunctionReturn(0);
  }

  /* hypre */
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscStrcmp(alg,"hypre",&flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(0);
  }
#endif
  SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");

next:
  /* common numeric driver for all algorithms that jump here */
  C->ops->productnumeric = MatProductNumeric_PtAP;
  PetscFunctionReturn(0);
}