/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmapiv.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

static PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;
  Mat_APMPI        *ptap;

  PetscFunctionBegin;
  MatCheckProduct(A, 1);
  ptap = (Mat_APMPI *)A->product->data;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  if (iascii) {
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 1) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 2) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
      } else if (ptap->algType == 3) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
{
  Mat_APMPI           *ptap = (Mat_APMPI *)data;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
  PetscCall(PetscFree(ptap->bufa));
  PetscCall(MatDestroy(&ptap->P_loc));
  PetscCall(MatDestroy(&ptap->P_oth));
  PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
  PetscCall(MatDestroy(&ptap->Rd));
  PetscCall(MatDestroy(&ptap->Ro));
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
    PetscCall(PetscFree(ap->i));
    PetscCall(PetscFree2(ap->j, ap->a));
    PetscCall(MatDestroy(&ptap->AP_loc));
  } else { /* used by alg_ptap */
    PetscCall(PetscFree(ptap->api));
    PetscCall(PetscFree(ptap->apj));
  }
  PetscCall(MatDestroy(&ptap->C_loc));
  PetscCall(MatDestroy(&ptap->C_oth));
  if (ptap->apa) PetscCall(PetscFree(ptap->apa));

  PetscCall(MatDestroy(&ptap->Pt));

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    PetscCall(PetscFree(merge->id_r));
    PetscCall(PetscFree(merge->len_s));
    PetscCall(PetscFree(merge->len_r));
    PetscCall(PetscFree(merge->bi));
    PetscCall(PetscFree(merge->bj));
    PetscCall(PetscFree(merge->buf_ri[0]));
    PetscCall(PetscFree(merge->buf_ri));
    PetscCall(PetscFree(merge->buf_rj[0]));
    PetscCall(PetscFree(merge->buf_rj));
    PetscCall(PetscFree(merge->coi));
    PetscCall(PetscFree(merge->coj));
    PetscCall(PetscFree(merge->owners_co));
    PetscCall(PetscLayoutDestroy(&merge->rowmap));
    PetscCall(PetscFree(ptap->merge));
  }
  PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));

  PetscCall(PetscSFDestroy(&ptap->sf));
  PetscCall(PetscFree(ptap->c_othi));
  PetscCall(PetscFree(ptap->c_rmti));
  PetscCall(PetscFree(ptap));
  PetscFunctionReturn(PETSC_SUCCESS);
}

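/* Numeric phase of the memory-scalable PtAP: all symbolic structures (Rd = Pd^T, Ro = Po^T,
   AP_loc, C_loc, C_oth and the ltog mapping) are taken from the Mat_APMPI attached to C by
   the symbolic phase below, so only matrix values are recomputed here. */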
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

  api = ap->i;
  apj = ap->j;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    apnz = api[i + 1] - api[i];
    apa  = ap->a + api[i];
    PetscCall(PetscArrayzero(apa, apnz));
    AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
  }
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
  PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* Always use scalable version since we are in the MPI scalable version */
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts. */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
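  /* If C has been assembled before (matrix reuse), flip it back to the unassembled state so that values
     are accumulated with MatSetValues_MPIAIJ() below; on the first pass the fast CSR copy is used instead. */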
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
  PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;

  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
  PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
  PetscFunctionReturn(PETSC_SUCCESS);
}

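/* Symbolic phase of the memory-scalable PtAP: builds the sparsity pattern of C = P^T*A*P and caches the
   intermediate products (Rd, Ro, AP_loc, C_loc, C_oth) in a Mat_APMPI attached to Cmpi->product so the
   numeric phase can be repeated cheaply. */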
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  Mat                      P_loc, P_oth;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  const PetscInt          *owners;
  PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  p_loc = (Mat_SeqAIJ *)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* used to compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */

  PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ad->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

  /* Create AP_loc for reuse */
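  /* AP_loc keeps only the columns that actually occur; global column indices are recovered
     through the local-to-global mapping ptap->ltog created by the compaction below. */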
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty\n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));

  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
  PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
  PetscCall(PetscLayoutSetUp(rowmap));
  PetscCall(PetscLayoutGetRanges(rowmap, &owners));

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));

  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));

  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
  }
  PetscCall(PetscHMapIGetSize(ta, &Crmax));
  PetscCall(PetscHMapIDestroy(&ta));

  /* (4) send and recv coi */
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (local on the receiving rank)
               [nrows+1:2*nrows+1]: i-structure index
    */
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
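  /* Merge, row by row, the local rows of C_loc with the rows received from other ranks to count
     the diagonal (dnz) and off-diagonal (onz) nonzeros needed to preallocate Cmpi. */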
  PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  nout = 0;
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
  PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
  PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(PETSC_SUCCESS);
}

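/* Compute the sparsity pattern of one row of AP = A*P for the all-at-once algorithms: columns that fall
   in this rank's column ownership range of C go into the hash set dht, all other columns go into oht.
   The dof argument treats dof consecutive rows/columns as one node. */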
static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
  PetscInt    pcstart, pcend, column, offset;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  pcstart *= dof;
  pcend = P->cmap->rend;
  pcend *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  for (j = 0; j < nzi; j++) {
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
  }
  /* off diag P */
  for (j = 0; j < nzi; j++) {
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row];
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col = 0; col < pnz; col++) {
        column = p_othcols[col] * dof + offset;
        if (column >= pcstart && column < pcend) {
          PetscCall(PetscHSetIAdd(dht, column));
        } else {
          PetscCall(PetscHSetIAdd(oht, column));
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
{
  Mat_MPIAIJ  *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ  *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt    *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
  PetscScalar  ra, *aa, *pa;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j = 0; j < nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    pa   = pd->a + pd->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }
  for (j = 0; j < nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    pa   = po->a + po->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j = 0; j < nzi; j++) {
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row];
      ra        = aa[j];
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa        = p_oth->a + pi[row];
      for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
      PetscCall(PetscLogFlops(2.0 * pnz));
    }
  } /* end if (ao) */

  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);

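/* Numeric phase of the all-at-once PtAP: each row of AP is formed on the fly in a hash map and
   immediately multiplied by the corresponding rows of Po^T (contributions owned by other ranks,
   accumulated in c_rmt* and shipped with a PetscSF) and of Pd^T (local contributions, added with
   MatSetValues). No explicit AP or R matrices are stored. */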
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
{
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV     hmap;
  PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
  PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
  PetscInt        offset, ii, pocol;
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  cmaxr = 0;
  for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
  PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  PetscCall(ISGetIndices(map, &mappingindices));
  for (i = 0; i < am && pon; i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = po->i[ii + 1] - po->i[ii];
    if (!nzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;

    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j = 0; j < nzi; j++) {
      pocol   = poj[j] * dof + offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj = 0; jj < voff; jj++) {
        apvaluestmp[jj] = apvalues[jj] * poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
          if (loc >= 0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc + 1);
            /* Move data backward */
            for (kk = c_rmtc[pocol]; kk > loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk - 1];
              c_rmtaa[kk] = c_rmtaa[kk - 1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
        PetscCall(PetscLogFlops(voff));
      } /* End jj */
    }   /* End j */
  }     /* End i */

  PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));

  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

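  /* Ship the accumulated remote rows to their owning ranks: the leaves of ptap->sf are the entries of
     c_rmtj/c_rmta and the roots are slots of c_othj/c_otha on the owner; MPI_REPLACE is safe because the
     symbolic phase set up a one-to-one leaf-to-root slot mapping. */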
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart = pcstart * dof;
  pcend   = pcend * dof;
  cd      = (Mat_SeqAIJ *)(c->A)->data;
  co      = (Mat_SeqAIJ *)(c->B)->data;

  cmaxr = 0;
  for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
  PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  for (i = 0; i < am && pn; i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = pd->i[ii + 1] - pd->i[ii];
    if (!nzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < nzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    }
  }
  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
  PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

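/* Merged variant of the all-at-once numeric phase: a single sweep over the rows of A forms each row of AP
   once and uses it for both the remote (Po^T) and the local (Pd^T) contributions, so communication is
   started only after all local work is finished. */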
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
{
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV     hmap;
  PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
  PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
  PetscInt        offset, ii, pocol;
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  cmaxr = 0;
  for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
  cd = (Mat_SeqAIJ *)(c->A)->data;
  co = (Mat_SeqAIJ *)(c->B)->data;
  for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
  PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  PetscCall(ISGetIndices(map, &mappingindices));
  for (i = 0; i < am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = po->i[ii + 1] - po->i[ii];
    dnzi   = pd->i[ii + 1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j = 0; j < nzi; j++) {
      pocol   = poj[j] * dof + offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj = 0; jj < voff; jj++) {
        apvaluestmp[jj] = apvalues[jj] * poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
          if (loc >= 0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc + 1);
            /* Move data backward */
            for (kk = c_rmtc[pocol]; kk > loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk - 1];
              c_rmtaa[kk] = c_rmtaa[kk - 1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    } /* End j */
  }   /* End i */

  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));

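  /* All local work is done, so Begin/End of the SF reductions are called back to back here; the
     overlapping variant above instead starts the reduction before its local sweep. */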
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* TODO: move algorithm selection to MatProductSetFromOptions */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm        comm;
  Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  MatType         mtype;
  PetscSF         sf;
  PetscSFNode    *iremote;
  PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI      ht, oht, *hta, *hto;
  PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
  PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
  PetscInt        nalg = 2, alg = 0, offset, ii;
  PetscMPIInt     owner;
  const PetscInt *mappingindices;
  PetscBool       flg;
  const char     *algTypes[2] = {"overlapping", "merged"};
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 5);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2;

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  /* This equals the number of off-diagonal columns of P */
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon, &c_rmtc));
  PetscCall(PetscMalloc1(pon, &hta));
  for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));

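  /* Pass 1 over the local rows of A: gather the column pattern of AP(i,:) into ht, then union it into
     hta[k] for every off-diagonal entry k of P(i,:); the sizes of these sets determine how many columns
     of C must be sent to the ranks that own the corresponding rows. */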
  PetscCall(ISGetIndices(map, &mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processes) */
  for (i = 0; i < am && pon; i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    offset = i % dof;
    ii     = i / dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi = po->i[ii + 1] - po->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
  }

  for (i = 0; i < pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]           = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

  for (i = 0; i < pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  PetscCall(PetscMalloc1(pon, &iremote));
  for (i = 0; i < pon; i++) {
    owner  = 0;
    lidx   = 0;
    offset = i % dof;
    ii     = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
    iremote[i].index = lidx * dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
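  /* Each leaf of this temporary SF is one local off-diagonal column of P; its root is the owner of the
     corresponding row of C, so the root degrees count how many ranks contribute to each local row. */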
  PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
  PetscCall(PetscMalloc1(rootspacesize, &rootspace));
  PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
  /* Get information from leaves:
   * number of columns other ranks contribute to my rows
   */
  PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
  /* Accumulate the number of columns received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j < rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
  nleaves = 0;
  for (i = 0; i < pon; i++) {
    owner = 0;
    ii    = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
    sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
    for (j = 0; j < sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

  PetscCall(PetscSFCreate(comm, &ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));

  PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
  for (i = 0; i < pn; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i = 0; i < am && pn; i++) {
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i % dof;
    ii     = i / dof;
    nzi    = pd->i[ii + 1] - pd->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    PetscCall(PetscHSetIGetSize(oht, &htosize));
    if (!(htsize + htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j = 0; j < nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
    }
  }
  PetscCall(ISRestoreIndices(map, &mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));

  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col >= pcstart && col < pcend) {
        PetscCall(PetscHSetIAdd(hta[i], col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i], col));
      }
    }
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hta[i]));
    PetscCall(PetscHSetIGetSize(hto[i], &htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(hta, hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  PetscCall(MatSetUp(Cmpi));
  PetscCall(PetscFree2(dnz, onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

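/* Symbolic phase of the merged all-at-once PtAP: identical in structure to the overlapping variant above,
   but a single pass over the rows of A fills the remote sets (hta) and the local diagonal/off-diagonal
   sets (htd/hto) at the same time. */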
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm        comm;
  Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  MatType         mtype;
  PetscSF         sf;
  PetscSFNode    *iremote;
  PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI      ht, oht, *hta, *hto, *htd;
  PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
  PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
  PetscInt        nalg = 2, alg = 0, offset, ii;
  PetscMPIInt     owner;
  PetscBool       flg;
  const char     *algTypes[2] = {"merged", "overlapping"};
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 5);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3;

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

  /* This equals the number of off-diagonal columns of P */
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon, &c_rmtc));
  PetscCall(PetscMalloc1(pon, &hta));
  for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
  for (i = 0; i < pn; i++) {
    PetscCall(PetscHSetICreate(&htd[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }

  PetscCall(ISGetIndices(map, &mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processes) */
  for (i = 0; i < am && (pon || pn); i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i % dof;
    ii     = i / dof;
    /* If the off diag is empty, we should not do any calculation */
    nzi  = po->i[ii + 1] - po->i[ii];
    dnzi = pd->i[ii + 1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    PetscCall(PetscHSetIGetSize(oht, &htosize));
    /* If AP is empty, just continue */
    if (!(htsize + htosize)) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    for (j = 0; j < nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
    }

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
    }
  }

  PetscCall(ISRestoreIndices(map, &mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  for (i = 0; i < pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]           = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

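  /* Pack each remote row's merged column set into the contiguous send buffer c_rmtj */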
  for (i = 0; i < pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  PetscCall(PetscMalloc1(pon, &iremote));
  for (i = 0; i < pon; i++) {
    owner  = 0;
    lidx   = 0;
    offset = i % dof;
    ii     = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
    iremote[i].index = lidx * dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
  PetscCall(PetscMalloc1(rootspacesize, &rootspace));
  PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
  /* Get information from leaves:
   * number of columns other ranks contribute to my rows
   */
  PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
  /* Accumulate the number of columns received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j < rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
  nleaves = 0;
  for (i = 0; i < pon; i++) {
    owner = 0;
    ii    = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
    sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
    for (j = 0; j < sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

  PetscCall(PetscSFCreate(comm, &ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));
  PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col >= pcstart && col < pcend) {
        PetscCall(PetscHSetIAdd(htd[i], col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i], col));
      }
    }
    PetscCall(PetscHSetIGetSize(htd[i], &htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&htd[i]));
    PetscCall(PetscHSetIGetSize(hto[i], &htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(htd, hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  PetscCall(PetscFree2(dnz, onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
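/* Example (sketch): the numeric variant chosen by the switch at the end of the previous function
   can also be selected at run time, e.g.

     -matptap_allatonce_via <type>

   where <type> is one of the entries of the algTypes[] array passed to PetscOptionsEList()
   there (defined earlier in this file), and the option is read with A's options prefix, if any. */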
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* arrays of length comm->size, storing send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* used to compute AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;
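  /* Sketch of the loop that follows: for each local row i of A, the sparsity of row i of AP is
     the union of the column patterns of the rows of P_loc/P_oth picked out by the nonzeros of
     Ad[i,:] and Ao[i,:]; conceptually

         pattern(AP(i,:)) = union over k with A(i,k) != 0 of pattern(P(k,:)).

     The union is accumulated in the condensed sorted linked list lnk (capacity Crmax), and each
     finished row is copied into the FreeSpace buffer, which is doubled on demand; nspacedouble
     counts those reallocations and is reported by PetscInfo() below. */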
  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ad->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));
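  /* Overview of steps (2)-(4): with R = P^T split into Rd = Pd^T and Ro = Po^T (transposes of the
     diagonal and off-diagonal blocks of P, computed in step (0)),

         C = R*(A*P),  C_loc = Rd*AP_loc  (rows of C owned by this process),
                       C_oth = Ro*AP_loc  (rows of C owned by other processes).

     Only the row/column structure (coi/coj) of C_oth has to be shipped to the owning processes,
     which is what steps (3) and (4) below do. */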
  /* (3) send coj of C_oth to other processors */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  rowmap->n  = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
  }
  PetscCall(PetscHMapIGetSize(ta, &Crmax));
  PetscCall(PetscHMapIDestroy(&ta));

  /* (4) send and recv coi */
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (global)
         [nrows+1:2*nrows+1]: i-structure index
    */
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
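  /* Worked example of the i-structure message above (illustration only): if two rows of Co with
     local indices 3 and 7, holding 4 and 2 nonzeros respectively, are destined for one process,
     then len_si = 2*(2+1) = 6 and buf_si = {2, 3, 7, 0, 4, 6}: the row count, the two local row
     indices, and the running offsets into the accompanying coj message. */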
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  PetscCall(PetscCalloc1(pN, &ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
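/* Note on the numeric kernel below: ptap->apa is a dense work array of length pN (allocated with
   PetscCalloc1(pN, &ptap->apa) at the end of the symbolic phase above). For each local row i,
   AProw_nonscalable() accumulates A[i,:]*P into apa indexed by global column; the loop over apnz
   then gathers those values into the CSR arrays of AP_loc and resets the touched entries of apa
   to zero so the accumulator is clean for the next row. The O(pN) work array is what makes this
   variant memory-nonscalable compared with MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(). */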
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a  = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row;
  PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
    apnz = api[i + 1] - api[i];
    for (j = 0; j < apnz; j++) {
      col                 = apj[j + api[i]];
      ap->a[j + ap->i[i]] = apa[col];
      apa[col]            = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts. */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A = product->A, P = product->B;
  MatProductAlgorithm alg  = product->alg;
  PetscReal           fill = product->fill;
  PetscBool           flg;

  PetscFunctionBegin;
  /* scalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "scalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    goto next;
  }

  /* nonscalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
    goto next;
  }

  /* allatonce */
  PetscCall(PetscStrcmp(alg, "allatonce", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
    goto next;
  }

  /* allatonce_merged */
  PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
    goto next;
  }

  /* backend general code */
  PetscCall(PetscStrcmp(alg, "backend", &flg));
  if (flg) {
    PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* hypre */
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscStrcmp(alg, "hypre", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

next:
  C->ops->productnumeric = MatProductNumeric_PtAP;
  PetscFunctionReturn(PETSC_SUCCESS);
}
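/* Usage sketch: the routines in this file are typically reached through either

     MatPtAP(A, P, MAT_INITIAL_MATRIX, fill, &C);
     ...
     MatPtAP(A, P, MAT_REUSE_MATRIX, fill, &C);    (reuses the symbolic data kept in C->product)

   where fill is the caller's fill estimate, or through the MatProduct interface, where the
   algorithm branch taken in MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ() above can be selected
   explicitly:

     MatProductCreate(A, P, NULL, &C);
     MatProductSetType(C, MATPRODUCT_PtAP);
     MatProductSetAlgorithm(C, "allatonce");       (or "scalable", "nonscalable", "allatonce_merged", ...)
     MatProductSetFill(C, PETSC_DEFAULT);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);
     MatProductNumeric(C);
*/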