1 2 /* 3 Defines projective product routines where A is a MPIAIJ matrix 4 C = P^T * A * P 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <../src/mat/impls/aij/mpi/mpiaij.h> 10 #include <petscbt.h> 11 #include <petsctime.h> 12 #include <petsc/private/hashmapiv.h> 13 #include <petsc/private/hashseti.h> 14 #include <petscsf.h> 15 16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer) 17 { 18 PetscBool iascii; 19 PetscViewerFormat format; 20 Mat_APMPI *ptap; 21 22 PetscFunctionBegin; 23 MatCheckProduct(A, 1); 24 ptap = (Mat_APMPI *)A->product->data; 25 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 26 if (iascii) { 27 PetscCall(PetscViewerGetFormat(viewer, &format)); 28 if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) { 29 if (ptap->algType == 0) { 30 PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n")); 31 } else if (ptap->algType == 1) { 32 PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n")); 33 } else if (ptap->algType == 2) { 34 PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n")); 35 } else if (ptap->algType == 3) { 36 PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n")); 37 } 38 } 39 } 40 PetscFunctionReturn(PETSC_SUCCESS); 41 } 42 43 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data) 44 { 45 Mat_APMPI *ptap = (Mat_APMPI *)data; 46 Mat_Merge_SeqsToMPI *merge; 47 48 PetscFunctionBegin; 49 PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r)); 50 PetscCall(PetscFree(ptap->bufa)); 51 PetscCall(MatDestroy(&ptap->P_loc)); 52 PetscCall(MatDestroy(&ptap->P_oth)); 53 PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */ 54 PetscCall(MatDestroy(&ptap->Rd)); 55 PetscCall(MatDestroy(&ptap->Ro)); 56 if (ptap->AP_loc) { /* used by alg_rap */ 57 Mat_SeqAIJ 
*ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data; 58 PetscCall(PetscFree(ap->i)); 59 PetscCall(PetscFree2(ap->j, ap->a)); 60 PetscCall(MatDestroy(&ptap->AP_loc)); 61 } else { /* used by alg_ptap */ 62 PetscCall(PetscFree(ptap->api)); 63 PetscCall(PetscFree(ptap->apj)); 64 } 65 PetscCall(MatDestroy(&ptap->C_loc)); 66 PetscCall(MatDestroy(&ptap->C_oth)); 67 if (ptap->apa) PetscCall(PetscFree(ptap->apa)); 68 69 PetscCall(MatDestroy(&ptap->Pt)); 70 71 merge = ptap->merge; 72 if (merge) { /* used by alg_ptap */ 73 PetscCall(PetscFree(merge->id_r)); 74 PetscCall(PetscFree(merge->len_s)); 75 PetscCall(PetscFree(merge->len_r)); 76 PetscCall(PetscFree(merge->bi)); 77 PetscCall(PetscFree(merge->bj)); 78 PetscCall(PetscFree(merge->buf_ri[0])); 79 PetscCall(PetscFree(merge->buf_ri)); 80 PetscCall(PetscFree(merge->buf_rj[0])); 81 PetscCall(PetscFree(merge->buf_rj)); 82 PetscCall(PetscFree(merge->coi)); 83 PetscCall(PetscFree(merge->coj)); 84 PetscCall(PetscFree(merge->owners_co)); 85 PetscCall(PetscLayoutDestroy(&merge->rowmap)); 86 PetscCall(PetscFree(ptap->merge)); 87 } 88 PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog)); 89 90 PetscCall(PetscSFDestroy(&ptap->sf)); 91 PetscCall(PetscFree(ptap->c_othi)); 92 PetscCall(PetscFree(ptap->c_rmti)); 93 PetscCall(PetscFree(ptap)); 94 PetscFunctionReturn(PETSC_SUCCESS); 95 } 96 97 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C) 98 { 99 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 100 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 101 Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq; 102 Mat_APMPI *ptap; 103 Mat AP_loc, C_loc, C_oth; 104 PetscInt i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout; 105 PetscScalar *apa; 106 const PetscInt *cols; 107 const PetscScalar *vals; 108 109 PetscFunctionBegin; 110 MatCheckProduct(C, 3); 111 ptap = (Mat_APMPI *)C->product->data; 112 PetscCheck(ptap, PetscObjectComm((PetscObject)C), 
PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 113 PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 114 115 PetscCall(MatZeroEntries(C)); 116 117 /* 1) get R = Pd^T,Ro = Po^T */ 118 if (ptap->reuse == MAT_REUSE_MATRIX) { 119 PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd)); 120 PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro)); 121 } 122 123 /* 2) get AP_loc */ 124 AP_loc = ptap->AP_loc; 125 ap = (Mat_SeqAIJ *)AP_loc->data; 126 127 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 128 /*-----------------------------------------------------*/ 129 if (ptap->reuse == MAT_REUSE_MATRIX) { 130 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 131 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 132 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 133 } 134 135 /* 2-2) compute numeric A_loc*P - dominating part */ 136 /* ---------------------------------------------- */ 137 /* get data from symbolic products */ 138 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 139 if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 140 141 api = ap->i; 142 apj = ap->j; 143 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj)); 144 for (i = 0; i < am; i++) { 145 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 146 apnz = api[i + 1] - api[i]; 147 apa = ap->a + api[i]; 148 PetscCall(PetscArrayzero(apa, apnz)); 149 AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa); 150 } 151 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj)); 152 PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout); 153 154 /* 3) C_loc = Rd*AP_loc, C_oth = 
Ro*AP_loc */ 155 /* Always use scalable version since we are in the MPI scalable version */ 156 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc)); 157 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth)); 158 159 C_loc = ptap->C_loc; 160 C_oth = ptap->C_oth; 161 162 /* add C_loc and Co to to C */ 163 PetscCall(MatGetOwnershipRange(C, &rstart, &rend)); 164 165 /* C_loc -> C */ 166 cm = C_loc->rmap->N; 167 c_seq = (Mat_SeqAIJ *)C_loc->data; 168 cols = c_seq->j; 169 vals = c_seq->a; 170 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j)); 171 172 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 173 /* when there are no off-processor parts. */ 174 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 175 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 176 /* a table, and other, more complex stuff has to be done. 
*/ 177 if (C->assembled) { 178 C->was_assembled = PETSC_TRUE; 179 C->assembled = PETSC_FALSE; 180 } 181 if (C->was_assembled) { 182 for (i = 0; i < cm; i++) { 183 ncols = c_seq->i[i + 1] - c_seq->i[i]; 184 row = rstart + i; 185 PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 186 cols += ncols; 187 vals += ncols; 188 } 189 } else { 190 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a)); 191 } 192 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j)); 193 PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout); 194 195 /* Co -> C, off-processor part */ 196 cm = C_oth->rmap->N; 197 c_seq = (Mat_SeqAIJ *)C_oth->data; 198 cols = c_seq->j; 199 vals = c_seq->a; 200 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j)); 201 for (i = 0; i < cm; i++) { 202 ncols = c_seq->i[i + 1] - c_seq->i[i]; 203 row = p->garray[i]; 204 PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 205 cols += ncols; 206 vals += ncols; 207 } 208 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 209 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 210 211 ptap->reuse = MAT_REUSE_MATRIX; 212 213 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j)); 214 PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout); 215 PetscFunctionReturn(PETSC_SUCCESS); 216 } 217 218 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi) 219 { 220 Mat_APMPI *ptap; 221 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 222 MPI_Comm comm; 223 PetscMPIInt size, rank; 224 Mat P_loc, P_oth; 225 
PetscFreeSpaceList free_space = NULL, current_space = NULL; 226 PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n; 227 PetscInt *lnk, i, k, pnz, row, nsend; 228 PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv; 229 PETSC_UNUSED PetscMPIInt icompleted = 0; 230 PetscInt **buf_rj, **buf_ri, **buf_ri_k; 231 const PetscInt *owners; 232 PetscInt len, proc, *dnz, *onz, nzi, nspacedouble; 233 PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci; 234 MPI_Request *swaits, *rwaits; 235 MPI_Status *sstatus, rstatus; 236 PetscLayout rowmap; 237 PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 238 PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */ 239 PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout; 240 Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth; 241 PetscScalar *apv; 242 PetscHMapI ta; 243 MatType mtype; 244 const char *prefix; 245 #if defined(PETSC_USE_INFO) 246 PetscReal apfill; 247 #endif 248 249 PetscFunctionBegin; 250 MatCheckProduct(Cmpi, 4); 251 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 252 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 253 PetscCallMPI(MPI_Comm_size(comm, &size)); 254 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 255 256 if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data; 257 258 /* create symbolic parallel matrix Cmpi */ 259 PetscCall(MatGetType(A, &mtype)); 260 PetscCall(MatSetType(Cmpi, mtype)); 261 262 /* create struct Mat_APMPI and attached it to C later */ 263 PetscCall(PetscNew(&ptap)); 264 ptap->reuse = MAT_INITIAL_MATRIX; 265 ptap->algType = 0; 266 267 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 268 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, 
&P_oth)); 269 /* get P_loc by taking all local rows of P */ 270 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc)); 271 272 ptap->P_loc = P_loc; 273 ptap->P_oth = P_oth; 274 275 /* (0) compute Rd = Pd^T, Ro = Po^T */ 276 /* --------------------------------- */ 277 PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd)); 278 PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro)); 279 280 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 281 /* ----------------------------------------------------------------- */ 282 p_loc = (Mat_SeqAIJ *)P_loc->data; 283 if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data; 284 285 /* create and initialize a linked list */ 286 PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */ 287 MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta); 288 MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta); 289 PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */ 290 291 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 292 293 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 294 if (ao) { 295 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space)); 296 } else { 297 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space)); 298 } 299 current_space = free_space; 300 nspacedouble = 0; 301 302 PetscCall(PetscMalloc1(am + 1, &api)); 303 api[0] = 0; 304 for (i = 0; i < am; i++) { 305 /* diagonal portion: Ad[i,:]*P */ 306 ai = ad->i; 307 pi = p_loc->i; 308 nzi = ai[i + 1] - ai[i]; 309 aj = ad->j + ai[i]; 310 for (j = 0; j < nzi; j++) { 311 row = aj[j]; 312 pnz = pi[row + 1] - pi[row]; 313 Jptr = p_loc->j + pi[row]; 314 /* add non-zero cols of P into the sorted linked list lnk */ 315 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 316 } 317 /* off-diagonal portion: Ao[i,:]*P */ 318 if (ao) { 319 
ai = ao->i; 320 pi = p_oth->i; 321 nzi = ai[i + 1] - ai[i]; 322 aj = ao->j + ai[i]; 323 for (j = 0; j < nzi; j++) { 324 row = aj[j]; 325 pnz = pi[row + 1] - pi[row]; 326 Jptr = p_oth->j + pi[row]; 327 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 328 } 329 } 330 apnz = lnk[0]; 331 api[i + 1] = api[i] + apnz; 332 333 /* if free space is not available, double the total space in the list */ 334 if (current_space->local_remaining < apnz) { 335 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 336 nspacedouble++; 337 } 338 339 /* Copy data into free space, then initialize lnk */ 340 PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk)); 341 342 current_space->array += apnz; 343 current_space->local_used += apnz; 344 current_space->local_remaining -= apnz; 345 } 346 /* Allocate space for apj and apv, initialize apj, and */ 347 /* destroy list of free space and other temporary array(s) */ 348 PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv)); 349 PetscCall(PetscFreeSpaceContiguous(&free_space, apj)); 350 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 351 352 /* Create AP_loc for reuse */ 353 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc)); 354 PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog)); 355 356 #if defined(PETSC_USE_INFO) 357 if (ao) { 358 apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1); 359 } else { 360 apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1); 361 } 362 ptap->AP_loc->info.mallocs = nspacedouble; 363 ptap->AP_loc->info.fill_ratio_given = fill; 364 ptap->AP_loc->info.fill_ratio_needed = apfill; 365 366 if (api[am]) { 367 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill)); 368 PetscCall(PetscInfo(ptap->AP_loc, "Use 
MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill)); 369 } else { 370 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n")); 371 } 372 #endif 373 374 /* (2-1) compute symbolic Co = Ro*AP_loc */ 375 /* -------------------------------------- */ 376 PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth)); 377 PetscCall(MatGetOptionsPrefix(A, &prefix)); 378 PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix)); 379 PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_")); 380 381 PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB)); 382 PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted")); 383 PetscCall(MatProductSetFill(ptap->C_oth, fill)); 384 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 385 PetscCall(MatProductSymbolic(ptap->C_oth)); 386 387 /* (3) send coj of C_oth to other processors */ 388 /* ------------------------------------------ */ 389 /* determine row ownership */ 390 PetscCall(PetscLayoutCreate(comm, &rowmap)); 391 PetscCall(PetscLayoutSetLocalSize(rowmap, pn)); 392 PetscCall(PetscLayoutSetBlockSize(rowmap, 1)); 393 PetscCall(PetscLayoutSetUp(rowmap)); 394 PetscCall(PetscLayoutGetRanges(rowmap, &owners)); 395 396 /* determine the number of messages to send, their lengths */ 397 PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co)); 398 PetscCall(PetscArrayzero(len_s, size)); 399 PetscCall(PetscArrayzero(len_si, size)); 400 401 c_oth = (Mat_SeqAIJ *)ptap->C_oth->data; 402 coi = c_oth->i; 403 coj = c_oth->j; 404 con = ptap->C_oth->rmap->n; 405 proc = 0; 406 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj)); 407 for (i = 0; i < con; i++) { 408 while (prmap[i] >= owners[proc + 1]) proc++; 409 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 410 len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 411 } 412 413 len = 0; /* max length of buf_si[], see (4) */ 414 
owners_co[0] = 0; 415 nsend = 0; 416 for (proc = 0; proc < size; proc++) { 417 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 418 if (len_s[proc]) { 419 nsend++; 420 len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 421 len += len_si[proc]; 422 } 423 } 424 425 /* determine the number and length of messages to receive for coi and coj */ 426 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv)); 427 PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri)); 428 429 /* post the Irecv and Isend of coj */ 430 PetscCall(PetscCommGetNewTag(comm, &tagj)); 431 PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits)); 432 PetscCall(PetscMalloc1(nsend + 1, &swaits)); 433 for (proc = 0, k = 0; proc < size; proc++) { 434 if (!len_s[proc]) continue; 435 i = owners_co[proc]; 436 PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 437 k++; 438 } 439 440 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 441 /* ---------------------------------------- */ 442 PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc)); 443 PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB)); 444 PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default")); 445 PetscCall(MatProductSetFill(ptap->C_loc, fill)); 446 447 PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix)); 448 PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_")); 449 450 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 451 PetscCall(MatProductSymbolic(ptap->C_loc)); 452 453 c_loc = (Mat_SeqAIJ *)ptap->C_loc->data; 454 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j)); 455 456 /* receives coj are complete */ 457 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 458 PetscCall(PetscFree(rwaits)); 459 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 460 461 /* add 
received column indices into ta to update Crmax */ 462 for (k = 0; k < nrecv; k++) { /* k-th received message */ 463 Jptr = buf_rj[k]; 464 for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1)); 465 } 466 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 467 PetscCall(PetscHMapIDestroy(&ta)); 468 469 /* (4) send and recv coi */ 470 /*-----------------------*/ 471 PetscCall(PetscCommGetNewTag(comm, &tagi)); 472 PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits)); 473 PetscCall(PetscMalloc1(len + 1, &buf_s)); 474 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 475 for (proc = 0, k = 0; proc < size; proc++) { 476 if (!len_s[proc]) continue; 477 /* form outgoing message for i-structure: 478 buf_si[0]: nrows to be sent 479 [1:nrows]: row index (global) 480 [nrows+1:2*nrows+1]: i-structure index 481 */ 482 /*-------------------------------------------*/ 483 nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */ 484 buf_si_i = buf_si + nrows + 1; 485 buf_si[0] = nrows; 486 buf_si_i[0] = 0; 487 nrows = 0; 488 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 489 nzi = coi[i + 1] - coi[i]; 490 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 491 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 492 nrows++; 493 } 494 PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 495 k++; 496 buf_si += len_si[proc]; 497 } 498 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 499 PetscCall(PetscFree(rwaits)); 500 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 501 502 PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co)); 503 PetscCall(PetscFree(len_ri)); 504 PetscCall(PetscFree(swaits)); 505 PetscCall(PetscFree(buf_s)); 506 507 /* (5) compute the local portion of Cmpi */ 508 /* ------------------------------------------ */ 509 /* set initial free space to be Crmax, 
sufficient for holding nozeros in each row of Cmpi */ 510 PetscCall(PetscFreeSpaceGet(Crmax, &free_space)); 511 current_space = free_space; 512 513 PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci)); 514 for (k = 0; k < nrecv; k++) { 515 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 516 nrows = *buf_ri_k[k]; 517 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 518 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 519 } 520 521 MatPreallocateBegin(comm, pn, pn, dnz, onz); 522 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 523 for (i = 0; i < pn; i++) { 524 /* add C_loc into Cmpi */ 525 nzi = c_loc->i[i + 1] - c_loc->i[i]; 526 Jptr = c_loc->j + c_loc->i[i]; 527 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 528 529 /* add received col data into lnk */ 530 for (k = 0; k < nrecv; k++) { /* k-th received message */ 531 if (i == *nextrow[k]) { /* i-th row */ 532 nzi = *(nextci[k] + 1) - *nextci[k]; 533 Jptr = buf_rj[k] + *nextci[k]; 534 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 535 nextrow[k]++; 536 nextci[k]++; 537 } 538 } 539 nzi = lnk[0]; 540 541 /* copy data into free space, then initialize lnk */ 542 PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk)); 543 PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz)); 544 } 545 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 546 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 547 PetscCall(PetscFreeSpaceDestroy(free_space)); 548 549 /* local sizes and preallocation */ 550 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 551 if (P->cmap->bs > 0) { 552 PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs)); 553 PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs)); 554 } 555 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 556 MatPreallocateEnd(dnz, 
onz); 557 558 /* members in merge */ 559 PetscCall(PetscFree(id_r)); 560 PetscCall(PetscFree(len_r)); 561 PetscCall(PetscFree(buf_ri[0])); 562 PetscCall(PetscFree(buf_ri)); 563 PetscCall(PetscFree(buf_rj[0])); 564 PetscCall(PetscFree(buf_rj)); 565 PetscCall(PetscLayoutDestroy(&rowmap)); 566 567 nout = 0; 568 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j)); 569 PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout); 570 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j)); 571 PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout); 572 573 /* attach the supporting struct to Cmpi for reuse */ 574 Cmpi->product->data = ptap; 575 Cmpi->product->view = MatView_MPIAIJ_PtAP; 576 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 577 578 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 579 Cmpi->assembled = PETSC_FALSE; 580 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 581 PetscFunctionReturn(PETSC_SUCCESS); 582 } 583 584 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht) 585 { 586 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 587 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 588 PetscInt *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k; 589 PetscInt pcstart, pcend, column, offset; 590 591 PetscFunctionBegin; 592 pcstart = P->cmap->rstart; 593 
pcstart *= dof; 594 pcend = P->cmap->rend; 595 pcend *= dof; 596 /* diagonal portion: Ad[i,:]*P */ 597 ai = ad->i; 598 nzi = ai[i + 1] - ai[i]; 599 aj = ad->j + ai[i]; 600 for (j = 0; j < nzi; j++) { 601 row = aj[j]; 602 offset = row % dof; 603 row /= dof; 604 nzpi = pd->i[row + 1] - pd->i[row]; 605 pj = pd->j + pd->i[row]; 606 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart)); 607 } 608 /* off diag P */ 609 for (j = 0; j < nzi; j++) { 610 row = aj[j]; 611 offset = row % dof; 612 row /= dof; 613 nzpi = po->i[row + 1] - po->i[row]; 614 pj = po->j + po->i[row]; 615 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset)); 616 } 617 618 /* off diagonal part: Ao[i, :]*P_oth */ 619 if (ao) { 620 ai = ao->i; 621 pi = p_oth->i; 622 nzi = ai[i + 1] - ai[i]; 623 aj = ao->j + ai[i]; 624 for (j = 0; j < nzi; j++) { 625 row = aj[j]; 626 offset = a->garray[row] % dof; 627 row = map[row]; 628 pnz = pi[row + 1] - pi[row]; 629 p_othcols = p_oth->j + pi[row]; 630 for (col = 0; col < pnz; col++) { 631 column = p_othcols[col] * dof + offset; 632 if (column >= pcstart && column < pcend) { 633 PetscCall(PetscHSetIAdd(dht, column)); 634 } else { 635 PetscCall(PetscHSetIAdd(oht, column)); 636 } 637 } 638 } 639 } /* end if (ao) */ 640 PetscFunctionReturn(PETSC_SUCCESS); 641 } 642 643 static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap) 644 { 645 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 646 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 647 PetscInt *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset; 648 PetscScalar ra, *aa, *pa; 649 650 PetscFunctionBegin; 651 pcstart = P->cmap->rstart; 652 pcstart *= dof; 653 654 /* diagonal 
portion: Ad[i,:]*P */ 655 ai = ad->i; 656 nzi = ai[i + 1] - ai[i]; 657 aj = ad->j + ai[i]; 658 aa = ad->a + ai[i]; 659 for (j = 0; j < nzi; j++) { 660 ra = aa[j]; 661 row = aj[j]; 662 offset = row % dof; 663 row /= dof; 664 nzpi = pd->i[row + 1] - pd->i[row]; 665 pj = pd->j + pd->i[row]; 666 pa = pd->a + pd->i[row]; 667 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k])); 668 PetscCall(PetscLogFlops(2.0 * nzpi)); 669 } 670 for (j = 0; j < nzi; j++) { 671 ra = aa[j]; 672 row = aj[j]; 673 offset = row % dof; 674 row /= dof; 675 nzpi = po->i[row + 1] - po->i[row]; 676 pj = po->j + po->i[row]; 677 pa = po->a + po->i[row]; 678 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k])); 679 PetscCall(PetscLogFlops(2.0 * nzpi)); 680 } 681 682 /* off diagonal part: Ao[i, :]*P_oth */ 683 if (ao) { 684 ai = ao->i; 685 pi = p_oth->i; 686 nzi = ai[i + 1] - ai[i]; 687 aj = ao->j + ai[i]; 688 aa = ao->a + ai[i]; 689 for (j = 0; j < nzi; j++) { 690 row = aj[j]; 691 offset = a->garray[row] % dof; 692 row = map[row]; 693 ra = aa[j]; 694 pnz = pi[row + 1] - pi[row]; 695 p_othcols = p_oth->j + pi[row]; 696 pa = p_oth->a + pi[row]; 697 for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col])); 698 PetscCall(PetscLogFlops(2.0 * pnz)); 699 } 700 } /* end if (ao) */ 701 702 PetscFunctionReturn(PETSC_SUCCESS); 703 } 704 705 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *); 706 707 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C) 708 { 709 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data; 710 Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data; 711 Mat_APMPI *ptap; 712 PetscHMapIV hmap; 713 PetscInt i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, 
*apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc; 714 PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa; 715 PetscInt offset, ii, pocol; 716 const PetscInt *mappingindices; 717 IS map; 718 719 PetscFunctionBegin; 720 MatCheckProduct(C, 4); 721 ptap = (Mat_APMPI *)C->product->data; 722 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 723 PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 724 725 PetscCall(MatZeroEntries(C)); 726 727 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 728 /*-----------------------------------------------------*/ 729 if (ptap->reuse == MAT_REUSE_MATRIX) { 730 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 731 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth)); 732 } 733 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 734 735 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 736 pon *= dof; 737 PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta)); 738 PetscCall(MatGetLocalSize(A, &am, NULL)); 739 cmaxr = 0; 740 for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); 741 PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc)); 742 PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap)); 743 PetscCall(ISGetIndices(map, &mappingindices)); 744 for (i = 0; i < am && pon; i++) { 745 PetscCall(PetscHMapIVClear(hmap)); 746 offset = i % dof; 747 ii = i / dof; 748 nzi = po->i[ii + 1] - po->i[ii]; 749 if (!nzi) continue; 750 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 751 voff = 0; 752 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 753 if (!voff) continue; 754 755 /* Form 
C(ii, :) */ 756 poj = po->j + po->i[ii]; 757 poa = po->a + po->i[ii]; 758 for (j = 0; j < nzi; j++) { 759 pocol = poj[j] * dof + offset; 760 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 761 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 762 for (jj = 0; jj < voff; jj++) { 763 apvaluestmp[jj] = apvalues[jj] * poa[j]; 764 /* If the row is empty */ 765 if (!c_rmtc[pocol]) { 766 c_rmtjj[jj] = apindices[jj]; 767 c_rmtaa[jj] = apvaluestmp[jj]; 768 c_rmtc[pocol]++; 769 } else { 770 PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc)); 771 if (loc >= 0) { /* hit */ 772 c_rmtaa[loc] += apvaluestmp[jj]; 773 PetscCall(PetscLogFlops(1.0)); 774 } else { /* new element */ 775 loc = -(loc + 1); 776 /* Move data backward */ 777 for (kk = c_rmtc[pocol]; kk > loc; kk--) { 778 c_rmtjj[kk] = c_rmtjj[kk - 1]; 779 c_rmtaa[kk] = c_rmtaa[kk - 1]; 780 } /* End kk */ 781 c_rmtjj[loc] = apindices[jj]; 782 c_rmtaa[loc] = apvaluestmp[jj]; 783 c_rmtc[pocol]++; 784 } 785 } 786 PetscCall(PetscLogFlops(voff)); 787 } /* End jj */ 788 } /* End j */ 789 } /* End i */ 790 791 PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc)); 792 PetscCall(PetscHMapIVDestroy(&hmap)); 793 794 PetscCall(MatGetLocalSize(P, NULL, &pn)); 795 pn *= dof; 796 PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha)); 797 798 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 799 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 800 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 801 pcstart = pcstart * dof; 802 pcend = pcend * dof; 803 cd = (Mat_SeqAIJ *)(c->A)->data; 804 co = (Mat_SeqAIJ *)(c->B)->data; 805 806 cmaxr = 0; 807 for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); 808 PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ)); 809 PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap)); 810 for (i = 0; i < am && 
pn; i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = pd->i[ii + 1] - pd->i[ii];
    if (!nzi) continue; /* no local (diagonal-block) entries of P in this row */
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue; /* row of AP is empty */
    /* Form C(ii, :) : scale the AP row by each local P entry and add into C */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < nzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    }
  }
  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
  PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
  PetscCall(PetscHMapIVDestroy(&hmap));
  /* Complete the split-phase reduction started earlier: remote rows (c_rmtj/c_rmta)
     arrive at their owners in c_othj/c_otha */
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote ranks into locally owned rows of C */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  /* Subsequent calls reuse the communication pattern/preallocation built above */
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* dof = 1 front end for the all-at-once numeric C = P^T*A*P */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Numeric phase of C = P^T*A*P, "merged" all-at-once variant: the remote
  (off-diagonal-block of P) and local (diagonal-block of P) contributions are
  formed in a single sweep over the rows of A, instead of two separate passes.
  Requires the symbolic data (ptap) attached to C->product by the matching
  symbolic routine.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
{
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV     hmap;
  PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
  PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
  PetscInt        offset, ii, pocol;
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  PetscCall(MatGetLocalSize(p->B, NULL, &pon)); /* pon = #offdiag columns of P (times dof) */
  pon *= dof;
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  /* cmaxr bounds the widest row we will assemble (remote or local) */
  cmaxr = 0;
  for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
  cd = (Mat_SeqAIJ *)(c->A)->data;
  co = (Mat_SeqAIJ *)(c->B)->data;
  for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
  PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  PetscCall(ISGetIndices(map, &mappingindices));
  /* Single merged sweep: each row of A contributes to both remote and local parts of C */
  for (i = 0; i < am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = po->i[ii + 1] - po->i[ii];
    dnzi   = pd->i[ii + 1] - pd->i[ii];
    if (!nzi && !dnzi) continue; /* row of P (both blocks) is empty */
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue; /* row of AP is empty */

    /* Form remote C(ii, :) : merge scaled AP row into the sorted send buffer */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j = 0; j < nzi; j++) {
      pocol   = poj[j] * dof + offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj = 0; jj < voff; jj++) {
        apvaluestmp[jj] = apvalues[jj] * poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          /* keep the row sorted: binary search, then insert or accumulate */
          PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
          if (loc >= 0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc + 1);
            /* Move data backward to open a slot at loc */
            for (kk = c_rmtc[pocol]; kk > loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk - 1];
              c_rmtaa[kk] = c_rmtaa[kk - 1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :) : scale the AP row by each diagonal-block P entry */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    } /* End j */
  } /* End i */

  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

  /* Ship the remote rows to their owning ranks (indices and values) */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* dof = 1 front end for the merged all-at-once numeric C = P^T*A*P */
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* TODO: move algorithm selection to MatProductSetFromOptions */
/*
  Symbolic phase of C = P^T*A*P, all-at-once variant: counts/collects column
  indices per row (via hash sets), builds the PetscSF that routes remotely
  computed rows to their owners, and preallocates Cmpi.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI   *ptap;
  Mat_MPIAIJ  *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm     comm;
  Mat_SeqAIJ  *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  MatType      mtype;
  PetscSF      sf;
  PetscSFNode *iremote;
  PetscInt     rootspacesize,
*rootspace, *rootspaceoffsets, nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI      ht, oht, *hta, *hto;
  PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
  PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
  PetscInt        nalg = 2, alg = 0, offset, ii;
  PetscMPIInt     owner;
  const PetscInt *mappingindices;
  PetscBool       flg;
  const char     *algTypes[2] = {"overlapping", "merged"};
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 5);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2; /* reported by MatView_MPIAIJ_PtAP as "allatonce" */

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon, &c_rmtc));
  /* one hash set per remotely owned row of C that this rank contributes to */
  PetscCall(PetscMalloc1(pon, &hta));
  for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));

  PetscCall(ISGetIndices(map, &mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */
  for (i = 0; i < am && pon; i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    offset = i % dof;
    ii     = i / dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii + 1] - po->i[ii];
    if (!nzi) continue;

    /* ht passed twice: diag and offdiag columns of AP are merged into one set here */
    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
  }

  /* Row sizes -> CSR-style offsets c_rmti and per-row counts c_rmtc */
  for (i = 0; i < pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]           = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

  for (i = 0; i < pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* One leaf per remotely owned row: locate owner rank and its local index */
  PetscCall(PetscMalloc1(pon, &iremote));
  for (i = 0; i < pon; i++) {
    owner  = 0;
    lidx   = 0;
    offset = i % dof;
    ii     = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
    iremote[i].index = lidx * dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
  PetscCall(PetscMalloc1(rootspacesize, &rootspace));
  PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j < rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each leaf where its contribution starts in the owner's receive buffer */
  PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Expand to one leaf per column entry for the persistent SF kept in ptap */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
  nleaves = 0;
  for (i = 0; i < pon; i++) {
    owner = 0;
    ii    = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
    sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
    for (j = 0; j < sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

  PetscCall(PetscSFCreate(comm, &ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));

  PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  /* hta[i]/hto[i]: diag/offdiag column sets of locally owned row i of C */
  PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
  for (i = 0; i < pn; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i = 0; i < am && pn; i++) {
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i % dof;
    ii     = i / dof;
    nzi    = pd->i[ii + 1] - pd->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    PetscCall(PetscHSetIGetSize(oht, &htosize));
    if (!(htsize + htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j = 0; j < nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
    }
  }

  PetscCall(ISRestoreIndices(map, &mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));

  /* Merge remotely contributed columns into the diag/offdiag sets, then count */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col >= pcstart && col < pcend) {
        PetscCall(PetscHSetIAdd(hta[i], col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i], col));
      }
    }
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hta[i]));
    PetscCall(PetscHSetIGetSize(hto[i], &htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(hta, hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation (sizes were set above; repeated here before preallocation) */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  PetscCall(MatSetUp(Cmpi));
  PetscCall(PetscFree2(dnz, onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm; default (alg 0) is the overlapping variant */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* dof = 1 front end for the all-at-once symbolic C = P^T*A*P */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Symbolic phase of C = P^T*A*P, "merged" all-at-once variant: the remote
  (off-diagonal-block) and local (diagonal-block) column sets are collected in a
  single sweep over the rows of A, matching the merged numeric routine.
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI   *ptap;
  Mat_MPIAIJ  *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm     comm;
  Mat_SeqAIJ  *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  MatType      mtype;
  PetscSF      sf;
  PetscSFNode *iremote;
  PetscInt     rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI      ht, oht, *hta, *hto, *htd;
  PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
  PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
  PetscInt        nalg = 2, alg = 0, offset, ii;
  PetscMPIInt     owner;
  PetscBool       flg;
  const char     *algTypes[2] = {"merged", "overlapping"};
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 5);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3; /* reported by MatView_MPIAIJ_PtAP as "merged allatonce" */

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

  /* This equals to the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon, &c_rmtc));
  PetscCall(PetscMalloc1(pon, &hta));
  for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));
  PetscCall(PetscHSetICreate(&oht));
  /* htd[i]/hto[i]: diag/offdiag column sets of locally owned row i of C */
  PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
  for (i = 0; i < pn; i++) {
    PetscCall(PetscHSetICreate(&htd[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }

  PetscCall(ISGetIndices(map, &mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */
  for (i = 0; i < am && (pon || pn); i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i % dof;
    ii     = i / dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi  = po->i[ii + 1] - po->i[ii];
    dnzi = pd->i[ii + 1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    PetscCall(PetscHSetIGetSize(oht, &htosize));
    /* If AP is empty, just continue */
    if (!(htsize + htosize)) continue;

    /* Form remote C(ii, :) : remote rows get both column sets merged */
    poj = po->j + po->i[ii];
    for (j = 0; j < nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
    }

    /* Form local C(ii, :) : local rows keep diag/offdiag sets separate */
    pdj = pd->j + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
    }
  }

  PetscCall(ISRestoreIndices(map, &mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Row sizes -> CSR-style offsets c_rmti and per-row counts c_rmtc */
  for (i = 0; i < pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]           = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

  for (i = 0; i < pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  /* One leaf per remotely owned row: locate owner rank and its local index */
  PetscCall(PetscMalloc1(pon, &iremote));
  for (i = 0; i < pon; i++) {
    owner  = 0;
    lidx   = 0;
    offset = i % dof;
    ii     = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
    iremote[i].index = lidx * dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
  PetscCall(PetscMalloc1(rootspacesize, &rootspace));
  PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
  /* Get information from leaves
   * Number of columns other people contribute to my rows
   * */
  PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j < rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  /* Tell each leaf where its contribution starts in the owner's receive buffer */
  PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  /* Expand to one leaf per column entry for the persistent SF kept in ptap */
  PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
  nleaves = 0;
  for (i = 0; i < pon; i++) {
    owner = 0;
    ii    = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
    sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
    for (j = 0; j < sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

  PetscCall(PetscSFCreate(comm, &ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));
  PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  /* Merge remotely contributed columns into the diag/offdiag sets, then count */
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col >= pcstart && col < pcend) {
        PetscCall(PetscHSetIAdd(htd[i], col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i], col));
      }
    }
    PetscCall(PetscHSetIGetSize(htd[i], &htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&htd[i]));
    PetscCall(PetscHSetIGetSize(hto[i], &htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(htd, hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  PetscCall(PetscFree2(dnz, onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm; default (alg 0) is the merged variant here */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm ");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* dof = 1 front end for the merged all-at-once symbolic C = P^T*A*P */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Symbolic phase of the nonscalable (algType = 1) PtAP: builds AP_loc = A_loc*P
  with a condensed linked list, then communicates Co = Ro*AP_loc row structure.
  (Continues beyond this view.)
*/
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt
**buf_rj, **buf_ri, **buf_ri_k; 1532 PetscInt len, proc, *dnz, *onz, *owners, nzi, nspacedouble; 1533 PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci; 1534 MPI_Request *swaits, *rwaits; 1535 MPI_Status *sstatus, rstatus; 1536 PetscLayout rowmap; 1537 PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 1538 PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */ 1539 PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi; 1540 Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth; 1541 PetscScalar *apv; 1542 PetscHMapI ta; 1543 MatType mtype; 1544 const char *prefix; 1545 #if defined(PETSC_USE_INFO) 1546 PetscReal apfill; 1547 #endif 1548 1549 PetscFunctionBegin; 1550 MatCheckProduct(Cmpi, 4); 1551 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1552 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1553 PetscCallMPI(MPI_Comm_size(comm, &size)); 1554 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 1555 1556 if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data; 1557 1558 /* create symbolic parallel matrix Cmpi */ 1559 PetscCall(MatGetType(A, &mtype)); 1560 PetscCall(MatSetType(Cmpi, mtype)); 1561 1562 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 1563 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 1564 1565 /* create struct Mat_APMPI and attached it to C later */ 1566 PetscCall(PetscNew(&ptap)); 1567 ptap->reuse = MAT_INITIAL_MATRIX; 1568 ptap->algType = 1; 1569 1570 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1571 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 1572 /* get P_loc by taking all local rows of P */ 1573 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 1574 1575 /* 
(0) compute Rd = Pd^T, Ro = Po^T */ 1576 /* --------------------------------- */ 1577 PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd)); 1578 PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro)); 1579 1580 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 1581 /* ----------------------------------------------------------------- */ 1582 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 1583 if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 1584 1585 /* create and initialize a linked list */ 1586 PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for compute AP_loc and Cmpi */ 1587 MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta); 1588 MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta); 1589 PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */ 1590 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 1591 1592 PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt)); 1593 1594 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 1595 if (ao) { 1596 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space)); 1597 } else { 1598 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space)); 1599 } 1600 current_space = free_space; 1601 nspacedouble = 0; 1602 1603 PetscCall(PetscMalloc1(am + 1, &api)); 1604 api[0] = 0; 1605 for (i = 0; i < am; i++) { 1606 /* diagonal portion: Ad[i,:]*P */ 1607 ai = ad->i; 1608 pi = p_loc->i; 1609 nzi = ai[i + 1] - ai[i]; 1610 aj = ad->j + ai[i]; 1611 for (j = 0; j < nzi; j++) { 1612 row = aj[j]; 1613 pnz = pi[row + 1] - pi[row]; 1614 Jptr = p_loc->j + pi[row]; 1615 /* add non-zero cols of P into the sorted linked list lnk */ 1616 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 1617 } 1618 /* off-diagonal portion: Ao[i,:]*P */ 1619 if (ao) { 
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill,
(double)apfill)); 1669 PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill)); 1670 } else { 1671 PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n")); 1672 } 1673 #endif 1674 1675 /* (2-1) compute symbolic Co = Ro*AP_loc */ 1676 /* ------------------------------------ */ 1677 PetscCall(MatGetOptionsPrefix(A, &prefix)); 1678 PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix)); 1679 PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_")); 1680 PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth)); 1681 PetscCall(MatGetOptionsPrefix(Cmpi, &prefix)); 1682 PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix)); 1683 PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_")); 1684 PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB)); 1685 PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default")); 1686 PetscCall(MatProductSetFill(ptap->C_oth, fill)); 1687 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 1688 PetscCall(MatProductSymbolic(ptap->C_oth)); 1689 1690 /* (3) send coj of C_oth to other processors */ 1691 /* ------------------------------------------ */ 1692 /* determine row ownership */ 1693 PetscCall(PetscLayoutCreate(comm, &rowmap)); 1694 rowmap->n = pn; 1695 rowmap->bs = 1; 1696 PetscCall(PetscLayoutSetUp(rowmap)); 1697 owners = rowmap->range; 1698 1699 /* determine the number of messages to send, their lengths */ 1700 PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co)); 1701 PetscCall(PetscArrayzero(len_s, size)); 1702 PetscCall(PetscArrayzero(len_si, size)); 1703 1704 c_oth = (Mat_SeqAIJ *)ptap->C_oth->data; 1705 coi = c_oth->i; 1706 coj = c_oth->j; 1707 con = ptap->C_oth->rmap->n; 1708 proc = 0; 1709 for (i = 0; i < con; i++) { 1710 while (prmap[i] >= owners[proc + 1]) proc++; 1711 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 1712 len_s[proc] += coi[i + 1] - coi[i]; /* num of 
nonzeros in Co to be sent to [proc] */ 1713 } 1714 1715 len = 0; /* max length of buf_si[], see (4) */ 1716 owners_co[0] = 0; 1717 nsend = 0; 1718 for (proc = 0; proc < size; proc++) { 1719 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 1720 if (len_s[proc]) { 1721 nsend++; 1722 len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 1723 len += len_si[proc]; 1724 } 1725 } 1726 1727 /* determine the number and length of messages to receive for coi and coj */ 1728 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv)); 1729 PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri)); 1730 1731 /* post the Irecv and Isend of coj */ 1732 PetscCall(PetscCommGetNewTag(comm, &tagj)); 1733 PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits)); 1734 PetscCall(PetscMalloc1(nsend + 1, &swaits)); 1735 for (proc = 0, k = 0; proc < size; proc++) { 1736 if (!len_s[proc]) continue; 1737 i = owners_co[proc]; 1738 PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 1739 k++; 1740 } 1741 1742 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 1743 /* ---------------------------------------- */ 1744 PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix)); 1745 PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_")); 1746 PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc)); 1747 PetscCall(MatGetOptionsPrefix(Cmpi, &prefix)); 1748 PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix)); 1749 PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_")); 1750 PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB)); 1751 PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default")); 1752 PetscCall(MatProductSetFill(ptap->C_loc, fill)); 1753 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 1754 PetscCall(MatProductSymbolic(ptap->C_loc)); 1755 1756 c_loc = (Mat_SeqAIJ *)ptap->C_loc->data; 1757 1758 /* receives coj are complete */ 1759 for 
(i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 1760 PetscCall(PetscFree(rwaits)); 1761 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 1762 1763 /* add received column indices into ta to update Crmax */ 1764 for (k = 0; k < nrecv; k++) { /* k-th received message */ 1765 Jptr = buf_rj[k]; 1766 for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1)); 1767 } 1768 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 1769 PetscCall(PetscHMapIDestroy(&ta)); 1770 1771 /* (4) send and recv coi */ 1772 /*-----------------------*/ 1773 PetscCall(PetscCommGetNewTag(comm, &tagi)); 1774 PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits)); 1775 PetscCall(PetscMalloc1(len + 1, &buf_s)); 1776 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1777 for (proc = 0, k = 0; proc < size; proc++) { 1778 if (!len_s[proc]) continue; 1779 /* form outgoing message for i-structure: 1780 buf_si[0]: nrows to be sent 1781 [1:nrows]: row index (global) 1782 [nrows+1:2*nrows+1]: i-structure index 1783 */ 1784 /*-------------------------------------------*/ 1785 nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */ 1786 buf_si_i = buf_si + nrows + 1; 1787 buf_si[0] = nrows; 1788 buf_si_i[0] = 0; 1789 nrows = 0; 1790 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 1791 nzi = coi[i + 1] - coi[i]; 1792 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 1793 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 1794 nrows++; 1795 } 1796 PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 1797 k++; 1798 buf_si += len_si[proc]; 1799 } 1800 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 1801 PetscCall(PetscFree(rwaits)); 1802 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 1803 1804 PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co)); 1805 
PetscCall(PetscFree(len_ri)); 1806 PetscCall(PetscFree(swaits)); 1807 PetscCall(PetscFree(buf_s)); 1808 1809 /* (5) compute the local portion of Cmpi */ 1810 /* ------------------------------------------ */ 1811 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 1812 PetscCall(PetscFreeSpaceGet(Crmax, &free_space)); 1813 current_space = free_space; 1814 1815 PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci)); 1816 for (k = 0; k < nrecv; k++) { 1817 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1818 nrows = *buf_ri_k[k]; 1819 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1820 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 1821 } 1822 1823 MatPreallocateBegin(comm, pn, pn, dnz, onz); 1824 PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt)); 1825 for (i = 0; i < pn; i++) { 1826 /* add C_loc into Cmpi */ 1827 nzi = c_loc->i[i + 1] - c_loc->i[i]; 1828 Jptr = c_loc->j + c_loc->i[i]; 1829 PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt)); 1830 1831 /* add received col data into lnk */ 1832 for (k = 0; k < nrecv; k++) { /* k-th received message */ 1833 if (i == *nextrow[k]) { /* i-th row */ 1834 nzi = *(nextci[k] + 1) - *nextci[k]; 1835 Jptr = buf_rj[k] + *nextci[k]; 1836 PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt)); 1837 nextrow[k]++; 1838 nextci[k]++; 1839 } 1840 } 1841 nzi = lnk[0]; 1842 1843 /* copy data into free space, then initialize lnk */ 1844 PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt)); 1845 PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz)); 1846 } 1847 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 1848 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1849 PetscCall(PetscFreeSpaceDestroy(free_space)); 1850 1851 /* local sizes and preallocation */ 1852 PetscCall(MatSetSizes(Cmpi, pn, pn, 
PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* dense accumulator reused by every call of the numeric phase (see MatPtAPNumeric_MPIAIJ_MPIAIJ below) */
  PetscCall(PetscCalloc1(pN, &ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - numeric phase of C = P^T * A * P for MPIAIJ matrices
   (nonscalable algorithm).

   Reuses the structures built by the symbolic phase and stashed in C->product->data:
   AP_loc (sparsity of A_loc*P), Rd/Ro (transposes of P's diagonal/off-diagonal blocks),
   the inner products C_loc = Rd*AP_loc and C_oth = Ro*AP_loc, and the dense work
   array apa. On exit C is assembled and ptap->reuse is set to MAT_REUSE_MATRIX so
   subsequent calls refresh rather than rebuild the intermediates.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row;
  PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T,Ro = Po^T */
  /* on reuse, refresh the transposes in place; on first use they were built by the symbolic phase */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
    /* apa is a dense accumulator: AProw_nonscalable() scatters row i of A*P into it;
       we then gather the entries listed in AP_loc's CSR pattern and zero only the
       touched slots, keeping apa clean for the next row */
    AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
    apnz = api[i + 1] - api[i];
    for (j = 0; j < apnz; j++) {
      col                 = apj[j + api[i]];
      ap->a[j + ap->i[i]] = apa[col];
      apa[col]            = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts.  */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    /* slow path: per-row insertion with global column lookup */
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    /* fast path: bulk copy of the whole local CSR structure */
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i]; /* p->garray[] maps local off-diagonal rows of Co to global row indices */
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ - symbolic-phase dispatcher for C = P^T*A*P
   with MPIAIJ operands. Selects the implementation named by product->alg
   ("scalable", "nonscalable", "allatonce", "allatonce_merged", "backend", and,
   when available, "hypre") and installs the matching productnumeric operation;
   errors with PETSC_ERR_SUP for any other algorithm name.
*/
PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A = product->A, P = product->B;
  MatProductAlgorithm alg  = product->alg;
  PetscReal           fill = product->fill;
  PetscBool           flg;

  PetscFunctionBegin;
  /* scalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "scalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    goto next;
  }

  /* nonscalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
    goto next;
  }

  /* allatonce */
  PetscCall(PetscStrcmp(alg, "allatonce", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
    goto next;
  }

  /* allatonce_merged */
  PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
    goto next;
  }

  /* backend general code */
  PetscCall(PetscStrcmp(alg, "backend", &flg));
  if (flg) {
    /* backend installs its own numeric op, so skip the common tail */
    PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* hypre */
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscStrcmp(alg, "hypre", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

next:
  /* common tail for the algorithms that share the generic PtAP numeric driver */
  C->ops->productnumeric = MatProductNumeric_PtAP;
  PetscFunctionReturn(PETSC_SUCCESS);
}