/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmapiv.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;
  Mat_APMPI        *ptap;

  PetscFunctionBegin;
  MatCheckProduct(A, 1);
  ptap = (Mat_APMPI *)A->product->data;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  if (iascii) {
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 1) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 2) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
      } else if (ptap->algType == 3) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
{
  Mat_APMPI           *ptap = (Mat_APMPI *)data;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
  PetscCall(PetscFree(ptap->bufa));
  PetscCall(MatDestroy(&ptap->P_loc));
  PetscCall(MatDestroy(&ptap->P_oth));
  PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
  PetscCall(MatDestroy(&ptap->Rd));
  PetscCall(MatDestroy(&ptap->Ro));
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ *)(ptap->AP_loc)->data;
    PetscCall(PetscFree(ap->i));
    PetscCall(PetscFree2(ap->j, ap->a));
    PetscCall(MatDestroy(&ptap->AP_loc));
  } else { /* used by alg_ptap */
    PetscCall(PetscFree(ptap->api));
    PetscCall(PetscFree(ptap->apj));
  }
  PetscCall(MatDestroy(&ptap->C_loc));
  PetscCall(MatDestroy(&ptap->C_oth));
  if (ptap->apa) PetscCall(PetscFree(ptap->apa));

  PetscCall(MatDestroy(&ptap->Pt));

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    PetscCall(PetscFree(merge->id_r));
    PetscCall(PetscFree(merge->len_s));
    PetscCall(PetscFree(merge->len_r));
    PetscCall(PetscFree(merge->bi));
    PetscCall(PetscFree(merge->bj));
    PetscCall(PetscFree(merge->buf_ri[0]));
    PetscCall(PetscFree(merge->buf_ri));
    PetscCall(PetscFree(merge->buf_rj[0]));
    PetscCall(PetscFree(merge->buf_rj));
    PetscCall(PetscFree(merge->coi));
    PetscCall(PetscFree(merge->coj));
    PetscCall(PetscFree(merge->owners_co));
    PetscCall(PetscLayoutDestroy(&merge->rowmap));
    PetscCall(PetscFree(ptap->merge));
  }
  PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));

  PetscCall(PetscSFDestroy(&ptap->sf));
  PetscCall(PetscFree(ptap->c_othi));
  PetscCall(PetscFree(ptap->c_rmti));
  PetscCall(PetscFree(ptap));
  PetscFunctionReturn(PETSC_SUCCESS);
}
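/*
  The routines in this file are not called directly by users; they are reached
  through MatPtAP()/MatProduct. A minimal usage sketch (hypothetical driver code,
  not part of this file), assuming A and P are assembled MPIAIJ matrices:

    Mat C;
    PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C));  // symbolic + numeric, fill estimate 2.0
    // ... change the numerical values of A (same nonzero pattern) ...
    PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));    // reuses the Mat_APMPI data attached to C
    PetscCall(MatDestroy(&C));

  The scalable/nonscalable/allatonce variants below are selected through
  MatProductSetAlgorithm() (e.g. via the -matproduct_ptap_via option).
*/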
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

  api = ap->i;
  apj = ap->j;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    apnz = api[i + 1] - api[i];
    apa  = ap->a + api[i];
    PetscCall(PetscArrayzero(apa, apnz));
    AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
  }
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
  PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);
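  /* Note: AProw_scalable() (a macro from the AIJ headers included above) merges row i of
     Ad*P_loc and Ao*P_oth directly into the slots api[i]..api[i+1] of AP_loc allocated by
     the symbolic phase, so no searching or reallocation happens in this loop. */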
  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* Always use the scalable version since we are in the MPI scalable version */
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and
     when there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat
     is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from
     a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
  PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);

  /* C_oth -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;

  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
  PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
  PetscFunctionReturn(PETSC_SUCCESS);
}
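/*
  Structure of the scalable PtAP (algType 0), shared by the numeric phase above and
  the symbolic phase below:

    (0) Rd = Pd^T,  Ro = Po^T              (local transposes of the diag/offdiag blocks of P)
    (1) AP_loc = A_loc*P = Ad*P_loc + Ao*P_oth
    (2) C_loc = Rd*AP_loc,  C_oth = Ro*AP_loc
    (3) C = C_loc + (communicated rows of) C_oth

  "Scalable" refers to the memory footprint: the row merges use a condensed linked
  list sized by Crmax (the largest merged row) rather than a dense O(pN) bit array
  as in the nonscalable variant at the end of this file.
*/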
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  Mat                      P_loc, P_oth;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  const PetscInt          *owners;
  PetscInt                 len, proc, *dnz, *onz, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  p_loc = (Mat_SeqAIJ *)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */

  PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ad->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
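  /* AP_loc is created below with global column indices (width pN);
     MatSeqAIJCompactOutExtraColumns_SeqAIJ() then compresses it to the columns actually
     present and records the compression in ptap->ltog, which is why the numeric phase
     brackets its work with ISLocalToGlobalMappingApply()/ISGlobalToLocalMappingApply() pairs. */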
  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog));

#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty\n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_"));

  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  PetscCall(PetscLayoutSetLocalSize(rowmap, pn));
  PetscCall(PetscLayoutSetBlockSize(rowmap, 1));
  PetscCall(PetscLayoutSetUp(rowmap));
  PetscCall(PetscLayoutGetRanges(rowmap, &owners));

  /* determine the number of messages to send and their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj));
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
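  /* Each destination rank gets two messages: the column indices coj (tag tagj, sent
     here) and the compressed row structure buf_si (tag tagi, sent in step (4) below). */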
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));

  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_"));

  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j));

  /* receives of coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
  }
  PetscCall(PetscHMapIGetSize(ta, &Crmax));
  PetscCall(PetscHMapIDestroy(&ta));

  /* (4) send and recv coi */
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of the k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));
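  /* Worked example of the i-structure format above (hypothetical numbers): if two rows
     with local indices 3 and 7 carrying 2 and 4 nonzeros are sent to a rank, then
     buf_si = {2, 3, 7, 0, 2, 6}: nrows = 2, rows {3, 7}, row offsets {0, 2, 6},
     and len_si = 2*(nrows + 1) = 6. */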
  /* (5) compute the local portion of Cmpi */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k];                 /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
  PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  nout = 0;
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j));
  PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout);
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j));
  PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout);

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled        = PETSC_FALSE;
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
  PetscFunctionReturn(PETSC_SUCCESS);
}
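/*
  The two *_private helpers below compute one row of AP for the all-at-once
  algorithms. The dof argument lets a P built for a scalar problem be applied to a
  problem with dof components per mesh point: row i of A corresponds to row i/dof
  of P, and the component offset i%dof is added back into the resulting column
  indices (columns of P are likewise scaled by dof).
*/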
static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt   *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k;
  PetscInt    pcstart, pcend, column, offset;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  pcstart *= dof;
  pcend = P->cmap->rend;
  pcend *= dof;
  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  for (j = 0; j < nzi; j++) {
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart));
  }
  /* off diag P */
  for (j = 0; j < nzi; j++) {
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row];
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      for (col = 0; col < pnz; col++) {
        column = p_othcols[col] * dof + offset;
        if (column >= pcstart && column < pcend) {
          PetscCall(PetscHSetIAdd(dht, column));
        } else {
          PetscCall(PetscHSetIAdd(oht, column));
        }
      }
    }
  } /* end if (ao) */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  PetscInt   *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset;
  PetscScalar ra, *aa, *pa;

  PetscFunctionBegin;
  pcstart = P->cmap->rstart;
  pcstart *= dof;

  /* diagonal portion: Ad[i,:]*P */
  ai  = ad->i;
  nzi = ai[i + 1] - ai[i];
  aj  = ad->j + ai[i];
  aa  = ad->a + ai[i];
  for (j = 0; j < nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = pd->i[row + 1] - pd->i[row];
    pj   = pd->j + pd->i[row];
    pa   = pd->a + pd->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }
  for (j = 0; j < nzi; j++) {
    ra     = aa[j];
    row    = aj[j];
    offset = row % dof;
    row /= dof;
    nzpi = po->i[row + 1] - po->i[row];
    pj   = po->j + po->i[row];
    pa   = po->a + po->i[row];
    for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k]));
    PetscCall(PetscLogFlops(2.0 * nzpi));
  }

  /* off diagonal part: Ao[i, :]*P_oth */
  if (ao) {
    ai  = ao->i;
    pi  = p_oth->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ao->j + ai[i];
    aa  = ao->a + ai[i];
    for (j = 0; j < nzi; j++) {
      row       = aj[j];
      offset    = a->garray[row] % dof;
      row       = map[row];
      ra        = aa[j];
      pnz       = pi[row + 1] - pi[row];
      p_othcols = p_oth->j + pi[row];
      pa        = p_oth->a + pi[row];
      for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col]));
      PetscCall(PetscLogFlops(2.0 * pnz));
    }
  } /* end if (ao) */

  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *);
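/*
  In the numeric helper above, hmap accumulates the sparse row AP(i,:) as
  (global column -> value) pairs; PetscHMapIVAddValue() either adds ra*pa[k] into an
  existing entry or inserts a new one, so duplicate columns coming from the Ad*Pd,
  Ad*Po, and Ao*P_oth contributions are merged on the fly.
*/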
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C)
{
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV     hmap;
  PetscInt        i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc;
  PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
  PetscInt        offset, ii, pocol;
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  cmaxr = 0;
  for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
  PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  PetscCall(ISGetIndices(map, &mappingindices));
  for (i = 0; i < am && pon; i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = po->i[ii + 1] - po->i[ii];
    if (!nzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;

    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j = 0; j < nzi; j++) {
      pocol   = poj[j] * dof + offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj = 0; jj < voff; jj++) {
        apvaluestmp[jj] = apvalues[jj] * poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
          if (loc >= 0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc + 1);
            /* Move data backward */
            for (kk = c_rmtc[pocol]; kk > loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk - 1];
              c_rmtaa[kk] = c_rmtaa[kk - 1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
        PetscCall(PetscLogFlops(voff));
      } /* End jj */
    } /* End j */
  } /* End i */

  PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));

  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
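  /* The SF reduction of the remote rows (c_rmt* -> c_oth*) is started here and
     finished only after the local C(ii, :) contributions below have been set,
     overlapping communication with computation; this is the "overlapping" variant
     listed in -matptap_allatonce_via. */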
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart = pcstart * dof;
  pcend   = pcend * dof;
  cd      = (Mat_SeqAIJ *)(c->A)->data;
  co      = (Mat_SeqAIJ *)(c->B)->data;

  cmaxr = 0;
  for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
  PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  for (i = 0; i < am && pn; i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = pd->i[ii + 1] - pd->i[ii];
    if (!nzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < nzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    }
  }
  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend));
  PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}
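/*
  The merged variant below differs from the overlapping one above in that each row of
  A is visited once: a single hash-map fill provides the AP row used for both the
  remote (off-diagonal) and the local (diagonal) contributions, and the two SF
  reductions are posted back-to-back instead of being overlapped with local work.
*/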
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C)
{
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data;
  Mat_SeqAIJ     *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data;
  Mat_APMPI      *ptap;
  PetscHMapIV     hmap;
  PetscInt        i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc;
  PetscScalar    *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa;
  PetscInt        offset, ii, pocol;
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth));
  }
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;

  PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  cmaxr = 0;
  for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]);
  cd = (Mat_SeqAIJ *)(c->A)->data;
  co = (Mat_SeqAIJ *)(c->B)->data;
  for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i]));
  PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc));
  PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap));
  PetscCall(ISGetIndices(map, &mappingindices));
  for (i = 0; i < am && (pon || pn); i++) {
    PetscCall(PetscHMapIVClear(hmap));
    offset = i % dof;
    ii     = i / dof;
    nzi    = po->i[ii + 1] - po->i[ii];
    dnzi   = pd->i[ii + 1] - pd->i[ii];
    if (!nzi && !dnzi) continue;
    PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap));
    voff = 0;
    PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues));
    if (!voff) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    poa = po->a + po->i[ii];
    for (j = 0; j < nzi; j++) {
      pocol   = poj[j] * dof + offset;
      c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
      c_rmtaa = c_rmta + ptap->c_rmti[pocol];
      for (jj = 0; jj < voff; jj++) {
        apvaluestmp[jj] = apvalues[jj] * poa[j];
        /* If the row is empty */
        if (!c_rmtc[pocol]) {
          c_rmtjj[jj] = apindices[jj];
          c_rmtaa[jj] = apvaluestmp[jj];
          c_rmtc[pocol]++;
        } else {
          PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc));
          if (loc >= 0) { /* hit */
            c_rmtaa[loc] += apvaluestmp[jj];
            PetscCall(PetscLogFlops(1.0));
          } else { /* new element */
            loc = -(loc + 1);
            /* Move data backward */
            for (kk = c_rmtc[pocol]; kk > loc; kk--) {
              c_rmtjj[kk] = c_rmtjj[kk - 1];
              c_rmtaa[kk] = c_rmtaa[kk - 1];
            } /* End kk */
            c_rmtjj[loc] = apindices[jj];
            c_rmtaa[loc] = apvaluestmp[jj];
            c_rmtc[pocol]++;
          }
        }
      } /* End jj */
      PetscCall(PetscLogFlops(voff));
    } /* End j */

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    pda = pd->a + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      row = pcstart + pdj[j] * dof + offset;
      for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j];
      PetscCall(PetscLogFlops(voff));
      PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES));
    } /* End j */
  } /* End i */

  PetscCall(ISRestoreIndices(map, &mappingindices));
  PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc));
  PetscCall(PetscHMapIVDestroy(&hmap));
  PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE));
  PetscCall(PetscFree2(c_rmtj, c_rmta));

  /* Add contributions from remote */
  for (i = 0; i < pn; i++) {
    row = i + pcstart;
    PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES));
  }
  PetscCall(PetscFree2(c_othj, c_otha));

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* TODO: move algorithm selection to MatProductSetFromOptions */
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm        comm;
  Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  MatType         mtype;
  PetscSF         sf;
  PetscSFNode    *iremote;
  PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI      ht, oht, *hta, *hto;
  PetscInt        pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
  PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
  PetscInt        nalg = 2, alg = 0, offset, ii;
  PetscMPIInt     owner;
  const PetscInt *mappingindices;
  PetscBool       flg;
  const char     *algTypes[2] = {"overlapping", "merged"};
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 5);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 2;

  /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));
  /* This equals the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon, &c_rmtc));
  PetscCall(PetscMalloc1(pon, &hta));
  for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));
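  /* Pass 1 below sizes C_rmt, the block of C rows this rank computes but does not
     own (one slot per off-diagonal column of P times dof); the PetscSF built on this
     layout later routes each such row block to its owning rank. */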
  PetscCall(ISGetIndices(map, &mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (the matrix to be sent to other processors) */
  for (i = 0; i < am && pon; i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    offset = i % dof;
    ii     = i / dof;
    /* If the off-diagonal block is empty, we should not do any calculation */
    nzi = po->i[ii + 1] - po->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    /* If AP is empty, just continue */
    if (!htsize) continue;
    /* Form C(ii, :) */
    poj = po->j + po->i[ii];
    for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
  }

  for (i = 0; i < pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]           = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));

  for (i = 0; i < pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  PetscCall(PetscMalloc1(pon, &iremote));
  for (i = 0; i < pon; i++) {
    owner  = 0;
    lidx   = 0;
    offset = i % dof;
    ii     = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
    iremote[i].index = lidx * dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
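  /* rootdegrees[i] below is the number of leaves (remote row blocks) attached to local
     row i. The gather collects, per contributor and per row, the number of columns it
     will send; prefix sums over that array yield both ptap->c_othi (receive offsets per
     local row) and rootspaceoffsets, which is scattered back so every sender knows
     where its block starts on the receiver. */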
  PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
  PetscCall(PetscMalloc1(rootspacesize, &rootspace));
  PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
  /* Get information from leaves:
   * Number of columns other ranks contribute to my rows
   */
  PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j < rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
  nleaves = 0;
  for (i = 0; i < pon; i++) {
    owner = 0;
    ii    = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
    sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
    for (j = 0; j < sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

  PetscCall(PetscSFCreate(comm, &ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));

  PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  PetscCall(PetscMalloc2(pn, &hta, pn, &hto));
  for (i = 0; i < pn; i++) {
    PetscCall(PetscHSetICreate(&hta[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }
  /* Work on local part */
  /* 4) Pass 1: Estimate memory for C_loc */
  for (i = 0; i < am && pn; i++) {
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i % dof;
    ii     = i / dof;
    nzi    = pd->i[ii + 1] - pd->i[ii];
    if (!nzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    PetscCall(PetscHSetIGetSize(oht, &htosize));
    if (!(htsize + htosize)) continue;
    /* Form C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j = 0; j < nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
    }
  }
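  /* After merging in the remote columns received below, hta[i]/hto[i] hold the
     diagonal/off-diagonal column sets of local row i of C, so the dnz[]/onz[] passed
     to MatMPIAIJSetPreallocation() are per-row counts of the actual sparsity pattern
     (exact up to numerical cancellation), not loose upper bounds. */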
  PetscCall(ISRestoreIndices(map, &mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));

  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diag part */
      if (col >= pcstart && col < pcend) {
        PetscCall(PetscHSetIAdd(hta[i], col));
      } else { /* off diag */
        PetscCall(PetscHSetIAdd(hto[i], col));
      }
    }
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hta[i]));
    PetscCall(PetscHSetIGetSize(hto[i], &htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(hta, hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  PetscCall(MatSetUp(Cmpi));
  PetscCall(PetscFree2(dnz, onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}
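/*
  The merged symbolic phase (algType 3) below uses the same SF-based communication
  scheme as the overlapping one above, but a single pass over the rows of A fills
  the remote sets (hta) and the local diagonal/off-diagonal sets (htd/hto) together,
  matching the one-pass structure of the merged numeric kernel.
*/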
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI      *ptap;
  Mat_MPIAIJ     *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm        comm;
  Mat_SeqAIJ     *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data;
  MatType         mtype;
  PetscSF         sf;
  PetscSFNode    *iremote;
  PetscInt        rootspacesize, *rootspace, *rootspaceoffsets, nleaves;
  const PetscInt *rootdegrees;
  PetscHSetI      ht, oht, *hta, *hto, *htd;
  PetscInt        pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets;
  PetscInt        lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj;
  PetscInt        nalg = 2, alg = 0, offset, ii;
  PetscMPIInt     owner;
  PetscBool       flg;
  const char     *algTypes[2] = {"merged", "overlapping"};
  const PetscInt *mappingindices;
  IS              map;

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 5);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

  /* Create symbolic parallel matrix Cmpi */
  PetscCall(MatGetLocalSize(P, NULL, &pn));
  pn *= dof;
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));

  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 3;

  /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth));
  PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map));

  /* This equals the number of offdiag columns in P */
  PetscCall(MatGetLocalSize(p->B, NULL, &pon));
  pon *= dof;
  /* offsets */
  PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti));
  /* The number of columns we will send to remote ranks */
  PetscCall(PetscMalloc1(pon, &c_rmtc));
  PetscCall(PetscMalloc1(pon, &hta));
  for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i]));
  PetscCall(MatGetLocalSize(A, &am, NULL));
  PetscCall(MatGetOwnershipRange(A, &arstart, &arend));
  /* Create hash table to merge all columns for C(i, :) */
  PetscCall(PetscHSetICreate(&ht));
  PetscCall(PetscHSetICreate(&oht));
  PetscCall(PetscMalloc2(pn, &htd, pn, &hto));
  for (i = 0; i < pn; i++) {
    PetscCall(PetscHSetICreate(&htd[i]));
    PetscCall(PetscHSetICreate(&hto[i]));
  }

  PetscCall(ISGetIndices(map, &mappingindices));
  ptap->c_rmti[0] = 0;
  /* 2) Pass 1: calculate the size of C_rmt (the matrix to be sent to other processors) */
  for (i = 0; i < am && (pon || pn); i++) {
    /* Form one row of AP */
    PetscCall(PetscHSetIClear(ht));
    PetscCall(PetscHSetIClear(oht));
    offset = i % dof;
    ii     = i / dof;
    /* If both the diagonal and the off-diagonal block are empty, we should not do any calculation */
    nzi  = po->i[ii + 1] - po->i[ii];
    dnzi = pd->i[ii + 1] - pd->i[ii];
    if (!nzi && !dnzi) continue;

    PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht));
    PetscCall(PetscHSetIGetSize(ht, &htsize));
    PetscCall(PetscHSetIGetSize(oht, &htosize));
    /* If AP is empty, just continue */
    if (!(htsize + htosize)) continue;

    /* Form remote C(ii, :) */
    poj = po->j + po->i[ii];
    for (j = 0; j < nzi; j++) {
      PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht));
    }

    /* Form local C(ii, :) */
    pdj = pd->j + pd->i[ii];
    for (j = 0; j < dnzi; j++) {
      PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht));
      PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht));
    }
  }

  PetscCall(ISRestoreIndices(map, &mappingindices));

  PetscCall(PetscHSetIDestroy(&ht));
  PetscCall(PetscHSetIDestroy(&oht));

  for (i = 0; i < pon; i++) {
    PetscCall(PetscHSetIGetSize(hta[i], &htsize));
    ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize;
    c_rmtc[i]           = htsize;
  }

  PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj));
  for (i = 0; i < pon; i++) {
    off = 0;
    PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i]));
    PetscCall(PetscHSetIDestroy(&hta[i]));
  }
  PetscCall(PetscFree(hta));

  PetscCall(PetscMalloc1(pon, &iremote));
  for (i = 0; i < pon; i++) {
    owner  = 0;
    lidx   = 0;
    offset = i % dof;
    ii     = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx));
    iremote[i].index = lidx * dof + offset;
    iremote[i].rank  = owner;
  }

  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
  PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));
  /* How many neighbors have contributions to my rows? */
  PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees));
  PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees));
  rootspacesize = 0;
  for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i];
  PetscCall(PetscMalloc1(rootspacesize, &rootspace));
  PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets));
  /* Get information from leaves:
   * Number of columns other ranks contribute to my rows
   */
  PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace));
  PetscCall(PetscFree(c_rmtc));
  PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi));
  /* The number of columns is received for each row */
  ptap->c_othi[0]     = 0;
  rootspacesize       = 0;
  rootspaceoffsets[0] = 0;
  for (i = 0; i < pn; i++) {
    rcvncols = 0;
    for (j = 0; j < rootdegrees[i]; j++) {
      rcvncols += rootspace[rootspacesize];
      rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
      rootspacesize++;
    }
    ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols;
  }
  PetscCall(PetscFree(rootspace));

  PetscCall(PetscMalloc1(pon, &c_rmtoffsets));
  PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(rootspaceoffsets));

  PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote));
  nleaves = 0;
  for (i = 0; i < pon; i++) {
    owner = 0;
    ii    = i / dof;
    PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL));
    sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i];
    for (j = 0; j < sendncols; j++) {
      iremote[nleaves].rank    = owner;
      iremote[nleaves++].index = c_rmtoffsets[i] + j;
    }
  }
  PetscCall(PetscFree(c_rmtoffsets));
  PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj));

  PetscCall(PetscSFCreate(comm, &ptap->sf));
  PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(ptap->sf));
  /* One to one map */
  PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  /* Get remote data */
  PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE));
  PetscCall(PetscFree(c_rmtj));
  PetscCall(PetscMalloc2(pn, &dnz, pn, &onz));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend));
  pcstart *= dof;
  pcend *= dof;
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
    rdj = c_othj + ptap->c_othi[i];
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diagonal part */
      if (col >= pcstart && col < pcend) {
        PetscCall(PetscHSetIAdd(htd[i], col));
      } else { /* off-diagonal part */
        PetscCall(PetscHSetIAdd(hto[i], col));
      }
    }
    PetscCall(PetscHSetIGetSize(htd[i], &htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&htd[i]));
    PetscCall(PetscHSetIGetSize(hto[i], &htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(htd, hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  PetscCall(PetscFree2(dnz, onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}
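/* Illustrative usage (hypothetical command line): the numeric kernel chosen by the
   switch in MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged() above can be selected
   at runtime via the option registered there, e.g.
     -matptap_allatonce_via <name>
   where the valid names are the entries of algTypes[] declared earlier in that
   function (not shown here); case 0 dispatches to
   MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged and case 1 to
   MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce. */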
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row, nsend;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  PetscInt                 len, proc, *dnz, *onz, *owners, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
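  /* Worked example (illustrative) of the symbolic row merge performed below:
     suppose Ad row i has column indices {0, 2}, P_loc row 0 has columns {1, 3},
     and P_loc row 2 has columns {3, 4}. The condensed linked list accumulates the
     sorted union {1, 3, 4}, so apnz = 3 and api[i+1] = api[i] + 3. */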
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = ad->j + ai[i];
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = ao->j + ai[i];
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  rowmap->n  = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }
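  /* Illustrative example of the counts gathered above: with size = 2 and three
     local Co rows owned by ranks {0, 1, 1} holding {2, 3, 1} nonzeros, the loop
     yields len_si = {1, 2} (rows per destination) and len_s = {2, 4} (nonzeros per
     destination); len_si[proc] is then inflated to 2*(len_si[proc]+1) below to make
     room for the i-structure header. */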
  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

  /* receives of coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
  }
  PetscCall(PetscHMapIGetSize(ta, &Crmax));
  PetscCall(PetscHMapIDestroy(&ta));

  /* (4) send and recv coi */
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (global)
         [nrows+1:2*nrows+1]: i-structure index
    */
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
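  /* Illustrative buf_si contents for the message format described above: sending
     nrows = 2 rows with local indices 3 and 7 holding 2 and 5 nonzeros gives
     buf_si = {2, 3, 7, 0, 2, 7}: the row count, the two local row indices, and the
     i-structure offsets {0, 2, 7}, for a total length of 2*(nrows+1) = 6 = len_si[proc]. */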
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  PetscCall(PetscCalloc1(pN, &ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
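/* Typical call sequence (illustrative) that reaches the symbolic routine above and
   the numeric routine below through the public interface:
     PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, fill, &C));  // symbolic + numeric
     PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, fill, &C));    // numeric only, reusing ptap
   The first call builds the Mat_APMPI support structure attached to C->product;
   the second call sees ptap->reuse == MAT_REUSE_MATRIX and skips the symbolic phase. */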
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row;
  PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
    apnz = api[i + 1] - api[i];
    for (j = 0; j < apnz; j++) {
      col                 = apj[j + api[i]];
      ap->a[j + ap->i[i]] = apa[col];
      apa[col]            = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and
     when there are no off-processor parts.
     If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat
     is no longer valid. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from
     a table and other, more complex bookkeeping is required. */
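  /* Illustrative timeline for the flip below: the first numeric call on a freshly
     created C (assembled == PETSC_FALSE, was_assembled == PETSC_FALSE) takes the
     fast CSR-copy branch; a later call with MAT_REUSE_MATRIX arrives with
     C->assembled == PETSC_TRUE from the previous MatAssemblyEnd(), so it is routed
     through MatSetValues_MPIAIJ() with ADD_VALUES instead. */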
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A = product->A, P = product->B;
  MatProductAlgorithm alg  = product->alg;
  PetscReal           fill = product->fill;
  PetscBool           flg;

  PetscFunctionBegin;
  /* scalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "scalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    goto next;
  }

  /* nonscalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
    goto next;
  }

  /* allatonce */
  PetscCall(PetscStrcmp(alg, "allatonce", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
    goto next;
  }

  /* allatonce_merged */
  PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
    goto next;
  }

  /* backend general code */
  PetscCall(PetscStrcmp(alg, "backend", &flg));
  if (flg) {
    PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* hypre */
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscStrcmp(alg, "hypre", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

next:
  C->ops->productnumeric = MatProductNumeric_PtAP;
  PetscFunctionReturn(PETSC_SUCCESS);
}
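/* Illustrative example (not part of the compiled source) of steering the dispatcher
   above through the MatProduct interface; the algorithm names match the
   PetscStrcmp() tests in MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ():
     PetscCall(MatProductCreate(A, P, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_PtAP));
     PetscCall(MatProductSetAlgorithm(C, "scalable"));  // or "nonscalable", "allatonce",
                                                        // "allatonce_merged", "backend", "hypre"
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/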