1 2 /* 3 Defines projective product routines where A is a MPIAIJ matrix 4 C = P^T * A * P 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <../src/mat/impls/aij/mpi/mpiaij.h> 10 #include <petscbt.h> 11 #include <petsctime.h> 12 #include <petsc/private/hashmapiv.h> 13 #include <petsc/private/hashseti.h> 14 #include <petscsf.h> 15 16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer) 17 { 18 PetscBool iascii; 19 PetscViewerFormat format; 20 Mat_APMPI *ptap; 21 22 PetscFunctionBegin; 23 MatCheckProduct(A, 1); 24 ptap = (Mat_APMPI *)A->product->data; 25 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 26 if (iascii) { 27 PetscCall(PetscViewerGetFormat(viewer, &format)); 28 if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) { 29 if (ptap->algType == 0) { 30 PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n")); 31 } else if (ptap->algType == 1) { 32 PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n")); 33 } else if (ptap->algType == 2) { 34 PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n")); 35 } else if (ptap->algType == 3) { 36 PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n")); 37 } 38 } 39 } 40 PetscFunctionReturn(0); 41 } 42 43 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data) 44 { 45 Mat_APMPI *ptap = (Mat_APMPI *)data; 46 Mat_Merge_SeqsToMPI *merge; 47 48 PetscFunctionBegin; 49 PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r)); 50 PetscCall(PetscFree(ptap->bufa)); 51 PetscCall(MatDestroy(&ptap->P_loc)); 52 PetscCall(MatDestroy(&ptap->P_oth)); 53 PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */ 54 PetscCall(MatDestroy(&ptap->Rd)); 55 PetscCall(MatDestroy(&ptap->Ro)); 56 if (ptap->AP_loc) { /* used by alg_rap */ 57 Mat_SeqAIJ *ap = 
(Mat_SeqAIJ *)(ptap->AP_loc)->data; 58 PetscCall(PetscFree(ap->i)); 59 PetscCall(PetscFree2(ap->j, ap->a)); 60 PetscCall(MatDestroy(&ptap->AP_loc)); 61 } else { /* used by alg_ptap */ 62 PetscCall(PetscFree(ptap->api)); 63 PetscCall(PetscFree(ptap->apj)); 64 } 65 PetscCall(MatDestroy(&ptap->C_loc)); 66 PetscCall(MatDestroy(&ptap->C_oth)); 67 if (ptap->apa) PetscCall(PetscFree(ptap->apa)); 68 69 PetscCall(MatDestroy(&ptap->Pt)); 70 71 merge = ptap->merge; 72 if (merge) { /* used by alg_ptap */ 73 PetscCall(PetscFree(merge->id_r)); 74 PetscCall(PetscFree(merge->len_s)); 75 PetscCall(PetscFree(merge->len_r)); 76 PetscCall(PetscFree(merge->bi)); 77 PetscCall(PetscFree(merge->bj)); 78 PetscCall(PetscFree(merge->buf_ri[0])); 79 PetscCall(PetscFree(merge->buf_ri)); 80 PetscCall(PetscFree(merge->buf_rj[0])); 81 PetscCall(PetscFree(merge->buf_rj)); 82 PetscCall(PetscFree(merge->coi)); 83 PetscCall(PetscFree(merge->coj)); 84 PetscCall(PetscFree(merge->owners_co)); 85 PetscCall(PetscLayoutDestroy(&merge->rowmap)); 86 PetscCall(PetscFree(ptap->merge)); 87 } 88 PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog)); 89 90 PetscCall(PetscSFDestroy(&ptap->sf)); 91 PetscCall(PetscFree(ptap->c_othi)); 92 PetscCall(PetscFree(ptap->c_rmti)); 93 PetscCall(PetscFree(ptap)); 94 PetscFunctionReturn(0); 95 } 96 97 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C) 98 { 99 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 100 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 101 Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq; 102 Mat_APMPI *ptap; 103 Mat AP_loc, C_loc, C_oth; 104 PetscInt i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout; 105 PetscScalar *apa; 106 const PetscInt *cols; 107 const PetscScalar *vals; 108 109 PetscFunctionBegin; 110 MatCheckProduct(C, 3); 111 ptap = (Mat_APMPI *)C->product->data; 112 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot 
be computed. Missing data"); 113 PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 114 115 PetscCall(MatZeroEntries(C)); 116 117 /* 1) get R = Pd^T,Ro = Po^T */ 118 if (ptap->reuse == MAT_REUSE_MATRIX) { 119 PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd)); 120 PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro)); 121 } 122 123 /* 2) get AP_loc */ 124 AP_loc = ptap->AP_loc; 125 ap = (Mat_SeqAIJ *)AP_loc->data; 126 127 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 128 /*-----------------------------------------------------*/ 129 if (ptap->reuse == MAT_REUSE_MATRIX) { 130 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 131 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 132 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 133 } 134 135 /* 2-2) compute numeric A_loc*P - dominating part */ 136 /* ---------------------------------------------- */ 137 /* get data from symbolic products */ 138 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 139 if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 140 141 api = ap->i; 142 apj = ap->j; 143 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj)); 144 for (i = 0; i < am; i++) { 145 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 146 apnz = api[i + 1] - api[i]; 147 apa = ap->a + api[i]; 148 PetscCall(PetscArrayzero(apa, apnz)); 149 AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa); 150 } 151 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj)); 152 PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout); 153 154 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 155 /* Always use scalable 
version since we are in the MPI scalable version */ 156 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc)); 157 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth)); 158 159 C_loc = ptap->C_loc; 160 C_oth = ptap->C_oth; 161 162 /* add C_loc and Co to to C */ 163 PetscCall(MatGetOwnershipRange(C, &rstart, &rend)); 164 165 /* C_loc -> C */ 166 cm = C_loc->rmap->N; 167 c_seq = (Mat_SeqAIJ *)C_loc->data; 168 cols = c_seq->j; 169 vals = c_seq->a; 170 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j)); 171 172 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 173 /* when there are no off-processor parts. */ 174 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 175 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 176 /* a table, and other, more complex stuff has to be done. 
*/ 177 if (C->assembled) { 178 C->was_assembled = PETSC_TRUE; 179 C->assembled = PETSC_FALSE; 180 } 181 if (C->was_assembled) { 182 for (i = 0; i < cm; i++) { 183 ncols = c_seq->i[i + 1] - c_seq->i[i]; 184 row = rstart + i; 185 PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 186 cols += ncols; 187 vals += ncols; 188 } 189 } else { 190 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a)); 191 } 192 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j)); 193 PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout); 194 195 /* Co -> C, off-processor part */ 196 cm = C_oth->rmap->N; 197 c_seq = (Mat_SeqAIJ *)C_oth->data; 198 cols = c_seq->j; 199 vals = c_seq->a; 200 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j)); 201 for (i = 0; i < cm; i++) { 202 ncols = c_seq->i[i + 1] - c_seq->i[i]; 203 row = p->garray[i]; 204 PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 205 cols += ncols; 206 vals += ncols; 207 } 208 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 209 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 210 211 ptap->reuse = MAT_REUSE_MATRIX; 212 213 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j)); 214 PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout); 215 PetscFunctionReturn(0); 216 } 217 218 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi) 219 { 220 Mat_APMPI *ptap; 221 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 222 MPI_Comm comm; 223 PetscMPIInt size, rank; 224 Mat P_loc, P_oth; 225 
PetscFreeSpaceList free_space = NULL, current_space = NULL; 226 PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n; 227 PetscInt *lnk, i, k, pnz, row, nsend; 228 PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv; 229 PETSC_UNUSED PetscMPIInt icompleted = 0; 230 PetscInt **buf_rj, **buf_ri, **buf_ri_k; 231 const PetscInt *owners; 232 PetscInt len, proc, *dnz, *onz, nzi, nspacedouble; 233 PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci; 234 MPI_Request *swaits, *rwaits; 235 MPI_Status *sstatus, rstatus; 236 PetscLayout rowmap; 237 PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 238 PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */ 239 PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout; 240 Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth; 241 PetscScalar *apv; 242 PetscTable ta; 243 MatType mtype; 244 const char *prefix; 245 #if defined(PETSC_USE_INFO) 246 PetscReal apfill; 247 #endif 248 249 PetscFunctionBegin; 250 MatCheckProduct(Cmpi, 4); 251 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 252 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 253 PetscCallMPI(MPI_Comm_size(comm, &size)); 254 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 255 256 if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data; 257 258 /* create symbolic parallel matrix Cmpi */ 259 PetscCall(MatGetType(A, &mtype)); 260 PetscCall(MatSetType(Cmpi, mtype)); 261 262 /* create struct Mat_APMPI and attached it to C later */ 263 PetscCall(PetscNew(&ptap)); 264 ptap->reuse = MAT_INITIAL_MATRIX; 265 ptap->algType = 0; 266 267 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 268 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, 
&P_oth)); 269 /* get P_loc by taking all local rows of P */ 270 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc)); 271 272 ptap->P_loc = P_loc; 273 ptap->P_oth = P_oth; 274 275 /* (0) compute Rd = Pd^T, Ro = Po^T */ 276 /* --------------------------------- */ 277 PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd)); 278 PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro)); 279 280 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 281 /* ----------------------------------------------------------------- */ 282 p_loc = (Mat_SeqAIJ *)P_loc->data; 283 if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data; 284 285 /* create and initialize a linked list */ 286 PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */ 287 MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta); 288 MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta); 289 PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */ 290 291 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 292 293 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 294 if (ao) { 295 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space)); 296 } else { 297 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space)); 298 } 299 current_space = free_space; 300 nspacedouble = 0; 301 302 PetscCall(PetscMalloc1(am + 1, &api)); 303 api[0] = 0; 304 for (i = 0; i < am; i++) { 305 /* diagonal portion: Ad[i,:]*P */ 306 ai = ad->i; 307 pi = p_loc->i; 308 nzi = ai[i + 1] - ai[i]; 309 aj = ad->j + ai[i]; 310 for (j = 0; j < nzi; j++) { 311 row = aj[j]; 312 pnz = pi[row + 1] - pi[row]; 313 Jptr = p_loc->j + pi[row]; 314 /* add non-zero cols of P into the sorted linked list lnk */ 315 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 316 } 317 /* off-diagonal portion: Ao[i,:]*P */ 318 if (ao) { 319 ai = 
ao->i; 320 pi = p_oth->i; 321 nzi = ai[i + 1] - ai[i]; 322 aj = ao->j + ai[i]; 323 for (j = 0; j < nzi; j++) { 324 row = aj[j]; 325 pnz = pi[row + 1] - pi[row]; 326 Jptr = p_oth->j + pi[row]; 327 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 328 } 329 } 330 apnz = lnk[0]; 331 api[i + 1] = api[i] + apnz; 332 333 /* if free space is not available, double the total space in the list */ 334 if (current_space->local_remaining < apnz) { 335 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 336 nspacedouble++; 337 } 338 339 /* Copy data into free space, then initialize lnk */ 340 PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk)); 341 342 current_space->array += apnz; 343 current_space->local_used += apnz; 344 current_space->local_remaining -= apnz; 345 } 346 /* Allocate space for apj and apv, initialize apj, and */ 347 /* destroy list of free space and other temporary array(s) */ 348 PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv)); 349 PetscCall(PetscFreeSpaceContiguous(&free_space, apj)); 350 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 351 352 /* Create AP_loc for reuse */ 353 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc)); 354 PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog)); 355 356 #if defined(PETSC_USE_INFO) 357 if (ao) { 358 apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1); 359 } else { 360 apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1); 361 } 362 ptap->AP_loc->info.mallocs = nspacedouble; 363 ptap->AP_loc->info.fill_ratio_given = fill; 364 ptap->AP_loc->info.fill_ratio_needed = apfill; 365 366 if (api[am]) { 367 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill)); 368 PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for 
best AP_loc performance.;\n", (double)apfill)); 369 } else { 370 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n")); 371 } 372 #endif 373 374 /* (2-1) compute symbolic Co = Ro*AP_loc */ 375 /* -------------------------------------- */ 376 PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth)); 377 PetscCall(MatGetOptionsPrefix(A, &prefix)); 378 PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix)); 379 PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_")); 380 381 PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB)); 382 PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted")); 383 PetscCall(MatProductSetFill(ptap->C_oth, fill)); 384 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 385 PetscCall(MatProductSymbolic(ptap->C_oth)); 386 387 /* (3) send coj of C_oth to other processors */ 388 /* ------------------------------------------ */ 389 /* determine row ownership */ 390 PetscCall(PetscLayoutCreate(comm, &rowmap)); 391 PetscCall(PetscLayoutSetLocalSize(rowmap, pn)); 392 PetscCall(PetscLayoutSetBlockSize(rowmap, 1)); 393 PetscCall(PetscLayoutSetUp(rowmap)); 394 PetscCall(PetscLayoutGetRanges(rowmap, &owners)); 395 396 /* determine the number of messages to send, their lengths */ 397 PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co)); 398 PetscCall(PetscArrayzero(len_s, size)); 399 PetscCall(PetscArrayzero(len_si, size)); 400 401 c_oth = (Mat_SeqAIJ *)ptap->C_oth->data; 402 coi = c_oth->i; 403 coj = c_oth->j; 404 con = ptap->C_oth->rmap->n; 405 proc = 0; 406 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj)); 407 for (i = 0; i < con; i++) { 408 while (prmap[i] >= owners[proc + 1]) proc++; 409 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 410 len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 411 } 412 413 len = 0; /* max length of buf_si[], see (4) */ 414 owners_co[0] = 0; 415 nsend = 0; 416 
for (proc = 0; proc < size; proc++) { 417 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 418 if (len_s[proc]) { 419 nsend++; 420 len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 421 len += len_si[proc]; 422 } 423 } 424 425 /* determine the number and length of messages to receive for coi and coj */ 426 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv)); 427 PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri)); 428 429 /* post the Irecv and Isend of coj */ 430 PetscCall(PetscCommGetNewTag(comm, &tagj)); 431 PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits)); 432 PetscCall(PetscMalloc1(nsend + 1, &swaits)); 433 for (proc = 0, k = 0; proc < size; proc++) { 434 if (!len_s[proc]) continue; 435 i = owners_co[proc]; 436 PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 437 k++; 438 } 439 440 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 441 /* ---------------------------------------- */ 442 PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc)); 443 PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB)); 444 PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default")); 445 PetscCall(MatProductSetFill(ptap->C_loc, fill)); 446 447 PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix)); 448 PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_")); 449 450 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 451 PetscCall(MatProductSymbolic(ptap->C_loc)); 452 453 c_loc = (Mat_SeqAIJ *)ptap->C_loc->data; 454 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j)); 455 456 /* receives coj are complete */ 457 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 458 PetscCall(PetscFree(rwaits)); 459 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 460 461 /* add received column indices into ta to 
update Crmax */ 462 for (k = 0; k < nrecv; k++) { /* k-th received message */ 463 Jptr = buf_rj[k]; 464 for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES)); 465 } 466 PetscCall(PetscTableGetCount(ta, &Crmax)); 467 PetscCall(PetscTableDestroy(&ta)); 468 469 /* (4) send and recv coi */ 470 /*-----------------------*/ 471 PetscCall(PetscCommGetNewTag(comm, &tagi)); 472 PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits)); 473 PetscCall(PetscMalloc1(len + 1, &buf_s)); 474 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 475 for (proc = 0, k = 0; proc < size; proc++) { 476 if (!len_s[proc]) continue; 477 /* form outgoing message for i-structure: 478 buf_si[0]: nrows to be sent 479 [1:nrows]: row index (global) 480 [nrows+1:2*nrows+1]: i-structure index 481 */ 482 /*-------------------------------------------*/ 483 nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */ 484 buf_si_i = buf_si + nrows + 1; 485 buf_si[0] = nrows; 486 buf_si_i[0] = 0; 487 nrows = 0; 488 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 489 nzi = coi[i + 1] - coi[i]; 490 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 491 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 492 nrows++; 493 } 494 PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 495 k++; 496 buf_si += len_si[proc]; 497 } 498 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 499 PetscCall(PetscFree(rwaits)); 500 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 501 502 PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co)); 503 PetscCall(PetscFree(len_ri)); 504 PetscCall(PetscFree(swaits)); 505 PetscCall(PetscFree(buf_s)); 506 507 /* (5) compute the local portion of Cmpi */ 508 /* ------------------------------------------ */ 509 /* set initial free space to be Crmax, sufficient for holding 
nozeros in each row of Cmpi */ 510 PetscCall(PetscFreeSpaceGet(Crmax, &free_space)); 511 current_space = free_space; 512 513 PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci)); 514 for (k = 0; k < nrecv; k++) { 515 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 516 nrows = *buf_ri_k[k]; 517 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 518 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 519 } 520 521 MatPreallocateBegin(comm, pn, pn, dnz, onz); 522 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 523 for (i = 0; i < pn; i++) { 524 /* add C_loc into Cmpi */ 525 nzi = c_loc->i[i + 1] - c_loc->i[i]; 526 Jptr = c_loc->j + c_loc->i[i]; 527 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 528 529 /* add received col data into lnk */ 530 for (k = 0; k < nrecv; k++) { /* k-th received message */ 531 if (i == *nextrow[k]) { /* i-th row */ 532 nzi = *(nextci[k] + 1) - *nextci[k]; 533 Jptr = buf_rj[k] + *nextci[k]; 534 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 535 nextrow[k]++; 536 nextci[k]++; 537 } 538 } 539 nzi = lnk[0]; 540 541 /* copy data into free space, then initialize lnk */ 542 PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk)); 543 PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz)); 544 } 545 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 546 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 547 PetscCall(PetscFreeSpaceDestroy(free_space)); 548 549 /* local sizes and preallocation */ 550 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 551 if (P->cmap->bs > 0) { 552 PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs)); 553 PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs)); 554 } 555 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 556 MatPreallocateEnd(dnz, onz); 557 558 /* members in 
merge */ 559 PetscCall(PetscFree(id_r)); 560 PetscCall(PetscFree(len_r)); 561 PetscCall(PetscFree(buf_ri[0])); 562 PetscCall(PetscFree(buf_ri)); 563 PetscCall(PetscFree(buf_rj[0])); 564 PetscCall(PetscFree(buf_rj)); 565 PetscCall(PetscLayoutDestroy(&rowmap)); 566 567 nout = 0; 568 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j)); 569 PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout); 570 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j)); 571 PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout); 572 573 /* attach the supporting struct to Cmpi for reuse */ 574 Cmpi->product->data = ptap; 575 Cmpi->product->view = MatView_MPIAIJ_PtAP; 576 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 577 578 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 579 Cmpi->assembled = PETSC_FALSE; 580 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 581 PetscFunctionReturn(0); 582 } 583 584 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht) 585 { 586 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 587 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 588 PetscInt *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k; 589 PetscInt pcstart, pcend, column, offset; 590 591 PetscFunctionBegin; 592 pcstart = P->cmap->rstart; 593 pcstart *= dof; 594 pcend = 
P->cmap->rend; 595 pcend *= dof; 596 /* diagonal portion: Ad[i,:]*P */ 597 ai = ad->i; 598 nzi = ai[i + 1] - ai[i]; 599 aj = ad->j + ai[i]; 600 for (j = 0; j < nzi; j++) { 601 row = aj[j]; 602 offset = row % dof; 603 row /= dof; 604 nzpi = pd->i[row + 1] - pd->i[row]; 605 pj = pd->j + pd->i[row]; 606 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart)); 607 } 608 /* off diag P */ 609 for (j = 0; j < nzi; j++) { 610 row = aj[j]; 611 offset = row % dof; 612 row /= dof; 613 nzpi = po->i[row + 1] - po->i[row]; 614 pj = po->j + po->i[row]; 615 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset)); 616 } 617 618 /* off diagonal part: Ao[i, :]*P_oth */ 619 if (ao) { 620 ai = ao->i; 621 pi = p_oth->i; 622 nzi = ai[i + 1] - ai[i]; 623 aj = ao->j + ai[i]; 624 for (j = 0; j < nzi; j++) { 625 row = aj[j]; 626 offset = a->garray[row] % dof; 627 row = map[row]; 628 pnz = pi[row + 1] - pi[row]; 629 p_othcols = p_oth->j + pi[row]; 630 for (col = 0; col < pnz; col++) { 631 column = p_othcols[col] * dof + offset; 632 if (column >= pcstart && column < pcend) { 633 PetscCall(PetscHSetIAdd(dht, column)); 634 } else { 635 PetscCall(PetscHSetIAdd(oht, column)); 636 } 637 } 638 } 639 } /* end if (ao) */ 640 PetscFunctionReturn(0); 641 } 642 643 static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap) 644 { 645 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 646 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 647 PetscInt *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset; 648 PetscScalar ra, *aa, *pa; 649 650 PetscFunctionBegin; 651 pcstart = P->cmap->rstart; 652 pcstart *= dof; 653 654 /* diagonal portion: Ad[i,:]*P */ 655 ai = ad->i; 656 
nzi = ai[i + 1] - ai[i]; 657 aj = ad->j + ai[i]; 658 aa = ad->a + ai[i]; 659 for (j = 0; j < nzi; j++) { 660 ra = aa[j]; 661 row = aj[j]; 662 offset = row % dof; 663 row /= dof; 664 nzpi = pd->i[row + 1] - pd->i[row]; 665 pj = pd->j + pd->i[row]; 666 pa = pd->a + pd->i[row]; 667 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k])); 668 PetscCall(PetscLogFlops(2.0 * nzpi)); 669 } 670 for (j = 0; j < nzi; j++) { 671 ra = aa[j]; 672 row = aj[j]; 673 offset = row % dof; 674 row /= dof; 675 nzpi = po->i[row + 1] - po->i[row]; 676 pj = po->j + po->i[row]; 677 pa = po->a + po->i[row]; 678 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k])); 679 PetscCall(PetscLogFlops(2.0 * nzpi)); 680 } 681 682 /* off diagonal part: Ao[i, :]*P_oth */ 683 if (ao) { 684 ai = ao->i; 685 pi = p_oth->i; 686 nzi = ai[i + 1] - ai[i]; 687 aj = ao->j + ai[i]; 688 aa = ao->a + ai[i]; 689 for (j = 0; j < nzi; j++) { 690 row = aj[j]; 691 offset = a->garray[row] % dof; 692 row = map[row]; 693 ra = aa[j]; 694 pnz = pi[row + 1] - pi[row]; 695 p_othcols = p_oth->j + pi[row]; 696 pa = p_oth->a + pi[row]; 697 for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col])); 698 PetscCall(PetscLogFlops(2.0 * pnz)); 699 } 700 } /* end if (ao) */ 701 702 PetscFunctionReturn(0); 703 } 704 705 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *); 706 707 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C) 708 { 709 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data; 710 Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data; 711 Mat_APMPI *ptap; 712 PetscHMapIV hmap; 713 PetscInt i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc; 714 
PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa; 715 PetscInt offset, ii, pocol; 716 const PetscInt *mappingindices; 717 IS map; 718 719 PetscFunctionBegin; 720 MatCheckProduct(C, 4); 721 ptap = (Mat_APMPI *)C->product->data; 722 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 723 PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 724 725 PetscCall(MatZeroEntries(C)); 726 727 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 728 /*-----------------------------------------------------*/ 729 if (ptap->reuse == MAT_REUSE_MATRIX) { 730 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 731 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth)); 732 } 733 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 734 735 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 736 pon *= dof; 737 PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta)); 738 PetscCall(MatGetLocalSize(A, &am, NULL)); 739 cmaxr = 0; 740 for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); 741 PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc)); 742 PetscCall(PetscHMapIVCreate(&hmap)); 743 PetscCall(PetscHMapIVResize(hmap, cmaxr)); 744 PetscCall(ISGetIndices(map, &mappingindices)); 745 for (i = 0; i < am && pon; i++) { 746 PetscCall(PetscHMapIVClear(hmap)); 747 offset = i % dof; 748 ii = i / dof; 749 nzi = po->i[ii + 1] - po->i[ii]; 750 if (!nzi) continue; 751 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 752 voff = 0; 753 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 754 if (!voff) continue; 755 756 /* Form C(ii, :) */ 757 poj = po->j 
+ po->i[ii]; 758 poa = po->a + po->i[ii]; 759 for (j = 0; j < nzi; j++) { 760 pocol = poj[j] * dof + offset; 761 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 762 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 763 for (jj = 0; jj < voff; jj++) { 764 apvaluestmp[jj] = apvalues[jj] * poa[j]; 765 /* If the row is empty */ 766 if (!c_rmtc[pocol]) { 767 c_rmtjj[jj] = apindices[jj]; 768 c_rmtaa[jj] = apvaluestmp[jj]; 769 c_rmtc[pocol]++; 770 } else { 771 PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc)); 772 if (loc >= 0) { /* hit */ 773 c_rmtaa[loc] += apvaluestmp[jj]; 774 PetscCall(PetscLogFlops(1.0)); 775 } else { /* new element */ 776 loc = -(loc + 1); 777 /* Move data backward */ 778 for (kk = c_rmtc[pocol]; kk > loc; kk--) { 779 c_rmtjj[kk] = c_rmtjj[kk - 1]; 780 c_rmtaa[kk] = c_rmtaa[kk - 1]; 781 } /* End kk */ 782 c_rmtjj[loc] = apindices[jj]; 783 c_rmtaa[loc] = apvaluestmp[jj]; 784 c_rmtc[pocol]++; 785 } 786 } 787 PetscCall(PetscLogFlops(voff)); 788 } /* End jj */ 789 } /* End j */ 790 } /* End i */ 791 792 PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc)); 793 PetscCall(PetscHMapIVDestroy(&hmap)); 794 795 PetscCall(MatGetLocalSize(P, NULL, &pn)); 796 pn *= dof; 797 PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha)); 798 799 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 800 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 801 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 802 pcstart = pcstart * dof; 803 pcend = pcend * dof; 804 cd = (Mat_SeqAIJ *)(c->A)->data; 805 co = (Mat_SeqAIJ *)(c->B)->data; 806 807 cmaxr = 0; 808 for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); 809 PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ)); 810 PetscCall(PetscHMapIVCreate(&hmap)); 811 PetscCall(PetscHMapIVResize(hmap, cmaxr)); 812 for (i = 0; i < am 
&& pn; i++) { 813 PetscCall(PetscHMapIVClear(hmap)); 814 offset = i % dof; 815 ii = i / dof; 816 nzi = pd->i[ii + 1] - pd->i[ii]; 817 if (!nzi) continue; 818 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 819 voff = 0; 820 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 821 if (!voff) continue; 822 /* Form C(ii, :) */ 823 pdj = pd->j + pd->i[ii]; 824 pda = pd->a + pd->i[ii]; 825 for (j = 0; j < nzi; j++) { 826 row = pcstart + pdj[j] * dof + offset; 827 for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; 828 PetscCall(PetscLogFlops(voff)); 829 PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES)); 830 } 831 } 832 PetscCall(ISRestoreIndices(map, &mappingindices)); 833 PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend)); 834 PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ)); 835 PetscCall(PetscHMapIVDestroy(&hmap)); 836 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 837 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 838 PetscCall(PetscFree2(c_rmtj, c_rmta)); 839 840 /* Add contributions from remote */ 841 for (i = 0; i < pn; i++) { 842 row = i + pcstart; 843 PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES)); 844 } 845 PetscCall(PetscFree2(c_othj, c_otha)); 846 847 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 848 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 849 850 ptap->reuse = MAT_REUSE_MATRIX; 851 PetscFunctionReturn(0); 852 } 853 854 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C) 855 { 856 PetscFunctionBegin; 857 858 PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C)); 859 PetscFunctionReturn(0); 860 } 861 862 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C) 863 { 864 Mat_MPIAIJ *p 
= (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data; 865 Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data; 866 Mat_APMPI *ptap; 867 PetscHMapIV hmap; 868 PetscInt i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc; 869 PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa; 870 PetscInt offset, ii, pocol; 871 const PetscInt *mappingindices; 872 IS map; 873 874 PetscFunctionBegin; 875 MatCheckProduct(C, 4); 876 ptap = (Mat_APMPI *)C->product->data; 877 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 878 PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 879 880 PetscCall(MatZeroEntries(C)); 881 882 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 883 /*-----------------------------------------------------*/ 884 if (ptap->reuse == MAT_REUSE_MATRIX) { 885 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 886 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth)); 887 } 888 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 889 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 890 pon *= dof; 891 PetscCall(MatGetLocalSize(P, NULL, &pn)); 892 pn *= dof; 893 894 PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta)); 895 PetscCall(MatGetLocalSize(A, &am, NULL)); 896 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 897 pcstart *= dof; 898 pcend *= dof; 899 cmaxr = 0; 900 for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); 901 cd = (Mat_SeqAIJ *)(c->A)->data; 902 co = (Mat_SeqAIJ *)(c->B)->data; 903 for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - 
co->i[i])); 904 PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc)); 905 PetscCall(PetscHMapIVCreate(&hmap)); 906 PetscCall(PetscHMapIVResize(hmap, cmaxr)); 907 PetscCall(ISGetIndices(map, &mappingindices)); 908 for (i = 0; i < am && (pon || pn); i++) { 909 PetscCall(PetscHMapIVClear(hmap)); 910 offset = i % dof; 911 ii = i / dof; 912 nzi = po->i[ii + 1] - po->i[ii]; 913 dnzi = pd->i[ii + 1] - pd->i[ii]; 914 if (!nzi && !dnzi) continue; 915 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 916 voff = 0; 917 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 918 if (!voff) continue; 919 920 /* Form remote C(ii, :) */ 921 poj = po->j + po->i[ii]; 922 poa = po->a + po->i[ii]; 923 for (j = 0; j < nzi; j++) { 924 pocol = poj[j] * dof + offset; 925 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 926 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 927 for (jj = 0; jj < voff; jj++) { 928 apvaluestmp[jj] = apvalues[jj] * poa[j]; 929 /* If the row is empty */ 930 if (!c_rmtc[pocol]) { 931 c_rmtjj[jj] = apindices[jj]; 932 c_rmtaa[jj] = apvaluestmp[jj]; 933 c_rmtc[pocol]++; 934 } else { 935 PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc)); 936 if (loc >= 0) { /* hit */ 937 c_rmtaa[loc] += apvaluestmp[jj]; 938 PetscCall(PetscLogFlops(1.0)); 939 } else { /* new element */ 940 loc = -(loc + 1); 941 /* Move data backward */ 942 for (kk = c_rmtc[pocol]; kk > loc; kk--) { 943 c_rmtjj[kk] = c_rmtjj[kk - 1]; 944 c_rmtaa[kk] = c_rmtaa[kk - 1]; 945 } /* End kk */ 946 c_rmtjj[loc] = apindices[jj]; 947 c_rmtaa[loc] = apvaluestmp[jj]; 948 c_rmtc[pocol]++; 949 } 950 } 951 } /* End jj */ 952 PetscCall(PetscLogFlops(voff)); 953 } /* End j */ 954 955 /* Form local C(ii, :) */ 956 pdj = pd->j + pd->i[ii]; 957 pda = pd->a + pd->i[ii]; 958 for (j = 0; j < dnzi; j++) { 959 row = pcstart + pdj[j] * dof + offset; 960 for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; 
/* End kk */ 961 PetscCall(PetscLogFlops(voff)); 962 PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES)); 963 } /* End j */ 964 } /* End i */ 965 966 PetscCall(ISRestoreIndices(map, &mappingindices)); 967 PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc)); 968 PetscCall(PetscHMapIVDestroy(&hmap)); 969 PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha)); 970 971 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 972 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 973 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 974 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 975 PetscCall(PetscFree2(c_rmtj, c_rmta)); 976 977 /* Add contributions from remote */ 978 for (i = 0; i < pn; i++) { 979 row = i + pcstart; 980 PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES)); 981 } 982 PetscCall(PetscFree2(c_othj, c_otha)); 983 984 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 985 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 986 987 ptap->reuse = MAT_REUSE_MATRIX; 988 PetscFunctionReturn(0); 989 } 990 991 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C) 992 { 993 PetscFunctionBegin; 994 995 PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C)); 996 PetscFunctionReturn(0); 997 } 998 999 /* TODO: move algorithm selection to MatProductSetFromOptions */ 1000 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) 1001 { 1002 Mat_APMPI *ptap; 1003 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data; 1004 MPI_Comm comm; 1005 Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 1006 MatType mtype; 1007 PetscSF sf; 1008 PetscSFNode *iremote; 1009 PetscInt rootspacesize, *rootspace, 
*rootspaceoffsets, nleaves; 1010 const PetscInt *rootdegrees; 1011 PetscHSetI ht, oht, *hta, *hto; 1012 PetscInt pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets; 1013 PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj; 1014 PetscInt nalg = 2, alg = 0, offset, ii; 1015 PetscMPIInt owner; 1016 const PetscInt *mappingindices; 1017 PetscBool flg; 1018 const char *algTypes[2] = {"overlapping", "merged"}; 1019 IS map; 1020 1021 PetscFunctionBegin; 1022 MatCheckProduct(Cmpi, 5); 1023 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1024 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1025 1026 /* Create symbolic parallel matrix Cmpi */ 1027 PetscCall(MatGetLocalSize(P, NULL, &pn)); 1028 pn *= dof; 1029 PetscCall(MatGetType(A, &mtype)); 1030 PetscCall(MatSetType(Cmpi, mtype)); 1031 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1032 1033 PetscCall(PetscNew(&ptap)); 1034 ptap->reuse = MAT_INITIAL_MATRIX; 1035 ptap->algType = 2; 1036 1037 /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1038 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth)); 1039 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 1040 /* This equals to the number of offdiag columns in P */ 1041 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 1042 pon *= dof; 1043 /* offsets */ 1044 PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti)); 1045 /* The number of columns we will send to remote ranks */ 1046 PetscCall(PetscMalloc1(pon, &c_rmtc)); 1047 PetscCall(PetscMalloc1(pon, &hta)); 1048 for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i])); 1049 PetscCall(MatGetLocalSize(A, &am, NULL)); 1050 PetscCall(MatGetOwnershipRange(A, &arstart, &arend)); 1051 /* Create hash table to merge all columns for C(i, :) */ 1052 
PetscCall(PetscHSetICreate(&ht)); 1053 1054 PetscCall(ISGetIndices(map, &mappingindices)); 1055 ptap->c_rmti[0] = 0; 1056 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1057 for (i = 0; i < am && pon; i++) { 1058 /* Form one row of AP */ 1059 PetscCall(PetscHSetIClear(ht)); 1060 offset = i % dof; 1061 ii = i / dof; 1062 /* If the off diag is empty, we should not do any calculation */ 1063 nzi = po->i[ii + 1] - po->i[ii]; 1064 if (!nzi) continue; 1065 1066 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht)); 1067 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1068 /* If AP is empty, just continue */ 1069 if (!htsize) continue; 1070 /* Form C(ii, :) */ 1071 poj = po->j + po->i[ii]; 1072 for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht)); 1073 } 1074 1075 for (i = 0; i < pon; i++) { 1076 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1077 ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize; 1078 c_rmtc[i] = htsize; 1079 } 1080 1081 PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj)); 1082 1083 for (i = 0; i < pon; i++) { 1084 off = 0; 1085 PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i])); 1086 PetscCall(PetscHSetIDestroy(&hta[i])); 1087 } 1088 PetscCall(PetscFree(hta)); 1089 1090 PetscCall(PetscMalloc1(pon, &iremote)); 1091 for (i = 0; i < pon; i++) { 1092 owner = 0; 1093 lidx = 0; 1094 offset = i % dof; 1095 ii = i / dof; 1096 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx)); 1097 iremote[i].index = lidx * dof + offset; 1098 iremote[i].rank = owner; 1099 } 1100 1101 PetscCall(PetscSFCreate(comm, &sf)); 1102 PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1103 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1104 PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE)); 1105 PetscCall(PetscSFSetFromOptions(sf)); 1106 
PetscCall(PetscSFSetUp(sf)); 1107 /* How many neighbors have contributions to my rows? */ 1108 PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees)); 1109 PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees)); 1110 rootspacesize = 0; 1111 for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i]; 1112 PetscCall(PetscMalloc1(rootspacesize, &rootspace)); 1113 PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets)); 1114 /* Get information from leaves 1115 * Number of columns other people contribute to my rows 1116 * */ 1117 PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace)); 1118 PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace)); 1119 PetscCall(PetscFree(c_rmtc)); 1120 PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi)); 1121 /* The number of columns is received for each row */ 1122 ptap->c_othi[0] = 0; 1123 rootspacesize = 0; 1124 rootspaceoffsets[0] = 0; 1125 for (i = 0; i < pn; i++) { 1126 rcvncols = 0; 1127 for (j = 0; j < rootdegrees[i]; j++) { 1128 rcvncols += rootspace[rootspacesize]; 1129 rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1130 rootspacesize++; 1131 } 1132 ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols; 1133 } 1134 PetscCall(PetscFree(rootspace)); 1135 1136 PetscCall(PetscMalloc1(pon, &c_rmtoffsets)); 1137 PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1138 PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1139 PetscCall(PetscSFDestroy(&sf)); 1140 PetscCall(PetscFree(rootspaceoffsets)); 1141 1142 PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote)); 1143 nleaves = 0; 1144 for (i = 0; i < pon; i++) { 1145 owner = 0; 1146 ii = i / dof; 1147 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL)); 1148 sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i]; 1149 for (j = 0; j < sendncols; j++) { 1150 iremote[nleaves].rank = owner; 1151 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1152 } 1153 } 1154 
PetscCall(PetscFree(c_rmtoffsets)); 1155 PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj)); 1156 1157 PetscCall(PetscSFCreate(comm, &ptap->sf)); 1158 PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1159 PetscCall(PetscSFSetFromOptions(ptap->sf)); 1160 /* One to one map */ 1161 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1162 1163 PetscCall(PetscMalloc2(pn, &dnz, pn, &onz)); 1164 PetscCall(PetscHSetICreate(&oht)); 1165 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 1166 pcstart *= dof; 1167 pcend *= dof; 1168 PetscCall(PetscMalloc2(pn, &hta, pn, &hto)); 1169 for (i = 0; i < pn; i++) { 1170 PetscCall(PetscHSetICreate(&hta[i])); 1171 PetscCall(PetscHSetICreate(&hto[i])); 1172 } 1173 /* Work on local part */ 1174 /* 4) Pass 1: Estimate memory for C_loc */ 1175 for (i = 0; i < am && pn; i++) { 1176 PetscCall(PetscHSetIClear(ht)); 1177 PetscCall(PetscHSetIClear(oht)); 1178 offset = i % dof; 1179 ii = i / dof; 1180 nzi = pd->i[ii + 1] - pd->i[ii]; 1181 if (!nzi) continue; 1182 1183 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht)); 1184 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1185 PetscCall(PetscHSetIGetSize(oht, &htosize)); 1186 if (!(htsize + htosize)) continue; 1187 /* Form C(ii, :) */ 1188 pdj = pd->j + pd->i[ii]; 1189 for (j = 0; j < nzi; j++) { 1190 PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht)); 1191 PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht)); 1192 } 1193 } 1194 1195 PetscCall(ISRestoreIndices(map, &mappingindices)); 1196 1197 PetscCall(PetscHSetIDestroy(&ht)); 1198 PetscCall(PetscHSetIDestroy(&oht)); 1199 1200 /* Get remote data */ 1201 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1202 PetscCall(PetscFree(c_rmtj)); 1203 1204 for (i = 0; i < pn; i++) { 1205 nzi = ptap->c_othi[i + 1] - ptap->c_othi[i]; 1206 rdj = c_othj + 
ptap->c_othi[i]; 1207 for (j = 0; j < nzi; j++) { 1208 col = rdj[j]; 1209 /* diag part */ 1210 if (col >= pcstart && col < pcend) { 1211 PetscCall(PetscHSetIAdd(hta[i], col)); 1212 } else { /* off diag */ 1213 PetscCall(PetscHSetIAdd(hto[i], col)); 1214 } 1215 } 1216 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1217 dnz[i] = htsize; 1218 PetscCall(PetscHSetIDestroy(&hta[i])); 1219 PetscCall(PetscHSetIGetSize(hto[i], &htsize)); 1220 onz[i] = htsize; 1221 PetscCall(PetscHSetIDestroy(&hto[i])); 1222 } 1223 1224 PetscCall(PetscFree2(hta, hto)); 1225 PetscCall(PetscFree(c_othj)); 1226 1227 /* local sizes and preallocation */ 1228 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1229 PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs)); 1230 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 1231 PetscCall(MatSetUp(Cmpi)); 1232 PetscCall(PetscFree2(dnz, onz)); 1233 1234 /* attach the supporting struct to Cmpi for reuse */ 1235 Cmpi->product->data = ptap; 1236 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 1237 Cmpi->product->view = MatView_MPIAIJ_PtAP; 1238 1239 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1240 Cmpi->assembled = PETSC_FALSE; 1241 /* pick an algorithm */ 1242 PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat"); 1243 alg = 0; 1244 PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg)); 1245 PetscOptionsEnd(); 1246 switch (alg) { 1247 case 0: 1248 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; 1249 break; 1250 case 1: 1251 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; 1252 break; 1253 default: 1254 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm "); 1255 } 1256 PetscFunctionReturn(0); 1257 } 1258 1259 PetscErrorCode 
MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C) 1260 { 1261 PetscFunctionBegin; 1262 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C)); 1263 PetscFunctionReturn(0); 1264 } 1265 1266 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) 1267 { 1268 Mat_APMPI *ptap; 1269 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data; 1270 MPI_Comm comm; 1271 Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 1272 MatType mtype; 1273 PetscSF sf; 1274 PetscSFNode *iremote; 1275 PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves; 1276 const PetscInt *rootdegrees; 1277 PetscHSetI ht, oht, *hta, *hto, *htd; 1278 PetscInt pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets; 1279 PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj; 1280 PetscInt nalg = 2, alg = 0, offset, ii; 1281 PetscMPIInt owner; 1282 PetscBool flg; 1283 const char *algTypes[2] = {"merged", "overlapping"}; 1284 const PetscInt *mappingindices; 1285 IS map; 1286 1287 PetscFunctionBegin; 1288 MatCheckProduct(Cmpi, 5); 1289 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1290 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1291 1292 /* Create symbolic parallel matrix Cmpi */ 1293 PetscCall(MatGetLocalSize(P, NULL, &pn)); 1294 pn *= dof; 1295 PetscCall(MatGetType(A, &mtype)); 1296 PetscCall(MatSetType(Cmpi, mtype)); 1297 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1298 1299 PetscCall(PetscNew(&ptap)); 1300 ptap->reuse = MAT_INITIAL_MATRIX; 1301 ptap->algType = 3; 1302 1303 /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1304 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth)); 1305 
PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 1306 1307 /* This equals to the number of offdiag columns in P */ 1308 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 1309 pon *= dof; 1310 /* offsets */ 1311 PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti)); 1312 /* The number of columns we will send to remote ranks */ 1313 PetscCall(PetscMalloc1(pon, &c_rmtc)); 1314 PetscCall(PetscMalloc1(pon, &hta)); 1315 for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i])); 1316 PetscCall(MatGetLocalSize(A, &am, NULL)); 1317 PetscCall(MatGetOwnershipRange(A, &arstart, &arend)); 1318 /* Create hash table to merge all columns for C(i, :) */ 1319 PetscCall(PetscHSetICreate(&ht)); 1320 PetscCall(PetscHSetICreate(&oht)); 1321 PetscCall(PetscMalloc2(pn, &htd, pn, &hto)); 1322 for (i = 0; i < pn; i++) { 1323 PetscCall(PetscHSetICreate(&htd[i])); 1324 PetscCall(PetscHSetICreate(&hto[i])); 1325 } 1326 1327 PetscCall(ISGetIndices(map, &mappingindices)); 1328 ptap->c_rmti[0] = 0; 1329 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1330 for (i = 0; i < am && (pon || pn); i++) { 1331 /* Form one row of AP */ 1332 PetscCall(PetscHSetIClear(ht)); 1333 PetscCall(PetscHSetIClear(oht)); 1334 offset = i % dof; 1335 ii = i / dof; 1336 /* If the off diag is empty, we should not do any calculation */ 1337 nzi = po->i[ii + 1] - po->i[ii]; 1338 dnzi = pd->i[ii + 1] - pd->i[ii]; 1339 if (!nzi && !dnzi) continue; 1340 1341 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht)); 1342 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1343 PetscCall(PetscHSetIGetSize(oht, &htosize)); 1344 /* If AP is empty, just continue */ 1345 if (!(htsize + htosize)) continue; 1346 1347 /* Form remote C(ii, :) */ 1348 poj = po->j + po->i[ii]; 1349 for (j = 0; j < nzi; j++) { 1350 PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht)); 1351 
PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht)); 1352 } 1353 1354 /* Form local C(ii, :) */ 1355 pdj = pd->j + pd->i[ii]; 1356 for (j = 0; j < dnzi; j++) { 1357 PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht)); 1358 PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht)); 1359 } 1360 } 1361 1362 PetscCall(ISRestoreIndices(map, &mappingindices)); 1363 1364 PetscCall(PetscHSetIDestroy(&ht)); 1365 PetscCall(PetscHSetIDestroy(&oht)); 1366 1367 for (i = 0; i < pon; i++) { 1368 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1369 ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize; 1370 c_rmtc[i] = htsize; 1371 } 1372 1373 PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj)); 1374 1375 for (i = 0; i < pon; i++) { 1376 off = 0; 1377 PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i])); 1378 PetscCall(PetscHSetIDestroy(&hta[i])); 1379 } 1380 PetscCall(PetscFree(hta)); 1381 1382 PetscCall(PetscMalloc1(pon, &iremote)); 1383 for (i = 0; i < pon; i++) { 1384 owner = 0; 1385 lidx = 0; 1386 offset = i % dof; 1387 ii = i / dof; 1388 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx)); 1389 iremote[i].index = lidx * dof + offset; 1390 iremote[i].rank = owner; 1391 } 1392 1393 PetscCall(PetscSFCreate(comm, &sf)); 1394 PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1395 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1396 PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE)); 1397 PetscCall(PetscSFSetFromOptions(sf)); 1398 PetscCall(PetscSFSetUp(sf)); 1399 /* How many neighbors have contributions to my rows? 
*/ 1400 PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees)); 1401 PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees)); 1402 rootspacesize = 0; 1403 for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i]; 1404 PetscCall(PetscMalloc1(rootspacesize, &rootspace)); 1405 PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets)); 1406 /* Get information from leaves 1407 * Number of columns other people contribute to my rows 1408 * */ 1409 PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace)); 1410 PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace)); 1411 PetscCall(PetscFree(c_rmtc)); 1412 PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi)); 1413 /* The number of columns is received for each row */ 1414 ptap->c_othi[0] = 0; 1415 rootspacesize = 0; 1416 rootspaceoffsets[0] = 0; 1417 for (i = 0; i < pn; i++) { 1418 rcvncols = 0; 1419 for (j = 0; j < rootdegrees[i]; j++) { 1420 rcvncols += rootspace[rootspacesize]; 1421 rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1422 rootspacesize++; 1423 } 1424 ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols; 1425 } 1426 PetscCall(PetscFree(rootspace)); 1427 1428 PetscCall(PetscMalloc1(pon, &c_rmtoffsets)); 1429 PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1430 PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1431 PetscCall(PetscSFDestroy(&sf)); 1432 PetscCall(PetscFree(rootspaceoffsets)); 1433 1434 PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote)); 1435 nleaves = 0; 1436 for (i = 0; i < pon; i++) { 1437 owner = 0; 1438 ii = i / dof; 1439 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL)); 1440 sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i]; 1441 for (j = 0; j < sendncols; j++) { 1442 iremote[nleaves].rank = owner; 1443 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1444 } 1445 } 1446 PetscCall(PetscFree(c_rmtoffsets)); 1447 PetscCall(PetscCalloc1(ptap->c_othi[pn], 
&c_othj)); 1448 1449 PetscCall(PetscSFCreate(comm, &ptap->sf)); 1450 PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1451 PetscCall(PetscSFSetFromOptions(ptap->sf)); 1452 /* One to one map */ 1453 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1454 /* Get remote data */ 1455 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1456 PetscCall(PetscFree(c_rmtj)); 1457 PetscCall(PetscMalloc2(pn, &dnz, pn, &onz)); 1458 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 1459 pcstart *= dof; 1460 pcend *= dof; 1461 for (i = 0; i < pn; i++) { 1462 nzi = ptap->c_othi[i + 1] - ptap->c_othi[i]; 1463 rdj = c_othj + ptap->c_othi[i]; 1464 for (j = 0; j < nzi; j++) { 1465 col = rdj[j]; 1466 /* diag part */ 1467 if (col >= pcstart && col < pcend) { 1468 PetscCall(PetscHSetIAdd(htd[i], col)); 1469 } else { /* off diag */ 1470 PetscCall(PetscHSetIAdd(hto[i], col)); 1471 } 1472 } 1473 PetscCall(PetscHSetIGetSize(htd[i], &htsize)); 1474 dnz[i] = htsize; 1475 PetscCall(PetscHSetIDestroy(&htd[i])); 1476 PetscCall(PetscHSetIGetSize(hto[i], &htsize)); 1477 onz[i] = htsize; 1478 PetscCall(PetscHSetIDestroy(&hto[i])); 1479 } 1480 1481 PetscCall(PetscFree2(htd, hto)); 1482 PetscCall(PetscFree(c_othj)); 1483 1484 /* local sizes and preallocation */ 1485 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1486 PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? 
dof : P->cmap->bs)); 1487 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 1488 PetscCall(PetscFree2(dnz, onz)); 1489 1490 /* attach the supporting struct to Cmpi for reuse */ 1491 Cmpi->product->data = ptap; 1492 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 1493 Cmpi->product->view = MatView_MPIAIJ_PtAP; 1494 1495 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1496 Cmpi->assembled = PETSC_FALSE; 1497 /* pick an algorithm */ 1498 PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat"); 1499 alg = 0; 1500 PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg)); 1501 PetscOptionsEnd(); 1502 switch (alg) { 1503 case 0: 1504 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; 1505 break; 1506 case 1: 1507 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; 1508 break; 1509 default: 1510 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm "); 1511 } 1512 PetscFunctionReturn(0); 1513 } 1514 1515 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C) 1516 { 1517 PetscFunctionBegin; 1518 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C)); 1519 PetscFunctionReturn(0); 1520 } 1521 1522 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi) 1523 { 1524 Mat_APMPI *ptap; 1525 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 1526 MPI_Comm comm; 1527 PetscMPIInt size, rank; 1528 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1529 PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n; 1530 PetscInt *lnk, i, k, pnz, row, nsend; 1531 PetscBT lnkbt; 1532 PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv; 1533 PETSC_UNUSED PetscMPIInt icompleted = 0; 1534 PetscInt **buf_rj, **buf_ri, 
**buf_ri_k; 1535 PetscInt len, proc, *dnz, *onz, *owners, nzi, nspacedouble; 1536 PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci; 1537 MPI_Request *swaits, *rwaits; 1538 MPI_Status *sstatus, rstatus; 1539 PetscLayout rowmap; 1540 PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 1541 PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */ 1542 PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi; 1543 Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth; 1544 PetscScalar *apv; 1545 PetscTable ta; 1546 MatType mtype; 1547 const char *prefix; 1548 #if defined(PETSC_USE_INFO) 1549 PetscReal apfill; 1550 #endif 1551 1552 PetscFunctionBegin; 1553 MatCheckProduct(Cmpi, 4); 1554 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1555 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1556 PetscCallMPI(MPI_Comm_size(comm, &size)); 1557 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 1558 1559 if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data; 1560 1561 /* create symbolic parallel matrix Cmpi */ 1562 PetscCall(MatGetType(A, &mtype)); 1563 PetscCall(MatSetType(Cmpi, mtype)); 1564 1565 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 1566 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 1567 1568 /* create struct Mat_APMPI and attached it to C later */ 1569 PetscCall(PetscNew(&ptap)); 1570 ptap->reuse = MAT_INITIAL_MATRIX; 1571 ptap->algType = 1; 1572 1573 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1574 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 1575 /* get P_loc by taking all local rows of P */ 1576 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 1577 1578 /* (0) compute Rd = 
Pd^T, Ro = Po^T */ 1579 /* --------------------------------- */ 1580 PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd)); 1581 PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro)); 1582 1583 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 1584 /* ----------------------------------------------------------------- */ 1585 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 1586 if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 1587 1588 /* create and initialize a linked list */ 1589 PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */ 1590 MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta); 1591 MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta); 1592 PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */ 1593 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 1594 1595 PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt)); 1596 1597 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 1598 if (ao) { 1599 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space)); 1600 } else { 1601 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space)); 1602 } 1603 current_space = free_space; 1604 nspacedouble = 0; 1605 1606 PetscCall(PetscMalloc1(am + 1, &api)); 1607 api[0] = 0; 1608 for (i = 0; i < am; i++) { 1609 /* diagonal portion: Ad[i,:]*P */ 1610 ai = ad->i; 1611 pi = p_loc->i; 1612 nzi = ai[i + 1] - ai[i]; 1613 aj = ad->j + ai[i]; 1614 for (j = 0; j < nzi; j++) { 1615 row = aj[j]; 1616 pnz = pi[row + 1] - pi[row]; 1617 Jptr = p_loc->j + pi[row]; 1618 /* add non-zero cols of P into the sorted linked list lnk */ 1619 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 1620 } 1621 /* off-diagonal portion: Ao[i,:]*P */ 1622 if (ao) { 1623 ai = ao->i; 1624 
pi = p_oth->i; 1625 nzi = ai[i + 1] - ai[i]; 1626 aj = ao->j + ai[i]; 1627 for (j = 0; j < nzi; j++) { 1628 row = aj[j]; 1629 pnz = pi[row + 1] - pi[row]; 1630 Jptr = p_oth->j + pi[row]; 1631 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 1632 } 1633 } 1634 apnz = lnk[0]; 1635 api[i + 1] = api[i] + apnz; 1636 if (ap_rmax < apnz) ap_rmax = apnz; 1637 1638 /* if free space is not available, double the total space in the list */ 1639 if (current_space->local_remaining < apnz) { 1640 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space)); 1641 nspacedouble++; 1642 } 1643 1644 /* Copy data into free space, then initialize lnk */ 1645 PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt)); 1646 1647 current_space->array += apnz; 1648 current_space->local_used += apnz; 1649 current_space->local_remaining -= apnz; 1650 } 1651 /* Allocate space for apj and apv, initialize apj, and */ 1652 /* destroy list of free space and other temporary array(s) */ 1653 PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv)); 1654 PetscCall(PetscFreeSpaceContiguous(&free_space, apj)); 1655 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1656 1657 /* Create AP_loc for reuse */ 1658 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc)); 1659 PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name)); 1660 #if defined(PETSC_USE_INFO) 1661 if (ao) { 1662 apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1); 1663 } else { 1664 apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1); 1665 } 1666 ptap->AP_loc->info.mallocs = nspacedouble; 1667 ptap->AP_loc->info.fill_ratio_given = fill; 1668 ptap->AP_loc->info.fill_ratio_needed = apfill; 1669 1670 if (api[am]) { 1671 PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill)); 1672 
PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill)); 1673 } else { 1674 PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n")); 1675 } 1676 #endif 1677 1678 /* (2-1) compute symbolic Co = Ro*AP_loc */ 1679 /* ------------------------------------ */ 1680 PetscCall(MatGetOptionsPrefix(A, &prefix)); 1681 PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix)); 1682 PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_")); 1683 PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth)); 1684 PetscCall(MatGetOptionsPrefix(Cmpi, &prefix)); 1685 PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix)); 1686 PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_")); 1687 PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB)); 1688 PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default")); 1689 PetscCall(MatProductSetFill(ptap->C_oth, fill)); 1690 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 1691 PetscCall(MatProductSymbolic(ptap->C_oth)); 1692 1693 /* (3) send coj of C_oth to other processors */ 1694 /* ------------------------------------------ */ 1695 /* determine row ownership */ 1696 PetscCall(PetscLayoutCreate(comm, &rowmap)); 1697 rowmap->n = pn; 1698 rowmap->bs = 1; 1699 PetscCall(PetscLayoutSetUp(rowmap)); 1700 owners = rowmap->range; 1701 1702 /* determine the number of messages to send, their lengths */ 1703 PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co)); 1704 PetscCall(PetscArrayzero(len_s, size)); 1705 PetscCall(PetscArrayzero(len_si, size)); 1706 1707 c_oth = (Mat_SeqAIJ *)ptap->C_oth->data; 1708 coi = c_oth->i; 1709 coj = c_oth->j; 1710 con = ptap->C_oth->rmap->n; 1711 proc = 0; 1712 for (i = 0; i < con; i++) { 1713 while (prmap[i] >= owners[proc + 1]) proc++; 1714 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 1715 len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent 
to [proc] */ 1716 } 1717 1718 len = 0; /* max length of buf_si[], see (4) */ 1719 owners_co[0] = 0; 1720 nsend = 0; 1721 for (proc = 0; proc < size; proc++) { 1722 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 1723 if (len_s[proc]) { 1724 nsend++; 1725 len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 1726 len += len_si[proc]; 1727 } 1728 } 1729 1730 /* determine the number and length of messages to receive for coi and coj */ 1731 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv)); 1732 PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri)); 1733 1734 /* post the Irecv and Isend of coj */ 1735 PetscCall(PetscCommGetNewTag(comm, &tagj)); 1736 PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits)); 1737 PetscCall(PetscMalloc1(nsend + 1, &swaits)); 1738 for (proc = 0, k = 0; proc < size; proc++) { 1739 if (!len_s[proc]) continue; 1740 i = owners_co[proc]; 1741 PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 1742 k++; 1743 } 1744 1745 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 1746 /* ---------------------------------------- */ 1747 PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix)); 1748 PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_")); 1749 PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc)); 1750 PetscCall(MatGetOptionsPrefix(Cmpi, &prefix)); 1751 PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix)); 1752 PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_")); 1753 PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB)); 1754 PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default")); 1755 PetscCall(MatProductSetFill(ptap->C_loc, fill)); 1756 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 1757 PetscCall(MatProductSymbolic(ptap->C_loc)); 1758 1759 c_loc = (Mat_SeqAIJ *)ptap->C_loc->data; 1760 1761 /* receives coj are complete */ 1762 for (i = 0; i < nrecv; i++) 
PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 1763 PetscCall(PetscFree(rwaits)); 1764 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 1765 1766 /* add received column indices into ta to update Crmax */ 1767 for (k = 0; k < nrecv; k++) { /* k-th received message */ 1768 Jptr = buf_rj[k]; 1769 for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES)); 1770 } 1771 PetscCall(PetscTableGetCount(ta, &Crmax)); 1772 PetscCall(PetscTableDestroy(&ta)); 1773 1774 /* (4) send and recv coi */ 1775 /*-----------------------*/ 1776 PetscCall(PetscCommGetNewTag(comm, &tagi)); 1777 PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits)); 1778 PetscCall(PetscMalloc1(len + 1, &buf_s)); 1779 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1780 for (proc = 0, k = 0; proc < size; proc++) { 1781 if (!len_s[proc]) continue; 1782 /* form outgoing message for i-structure: 1783 buf_si[0]: nrows to be sent 1784 [1:nrows]: row index (global) 1785 [nrows+1:2*nrows+1]: i-structure index 1786 */ 1787 /*-------------------------------------------*/ 1788 nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */ 1789 buf_si_i = buf_si + nrows + 1; 1790 buf_si[0] = nrows; 1791 buf_si_i[0] = 0; 1792 nrows = 0; 1793 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 1794 nzi = coi[i + 1] - coi[i]; 1795 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 1796 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 1797 nrows++; 1798 } 1799 PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 1800 k++; 1801 buf_si += len_si[proc]; 1802 } 1803 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 1804 PetscCall(PetscFree(rwaits)); 1805 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 1806 1807 PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co)); 1808 
PetscCall(PetscFree(len_ri)); 1809 PetscCall(PetscFree(swaits)); 1810 PetscCall(PetscFree(buf_s)); 1811 1812 /* (5) compute the local portion of Cmpi */ 1813 /* ------------------------------------------ */ 1814 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 1815 PetscCall(PetscFreeSpaceGet(Crmax, &free_space)); 1816 current_space = free_space; 1817 1818 PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci)); 1819 for (k = 0; k < nrecv; k++) { 1820 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1821 nrows = *buf_ri_k[k]; 1822 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1823 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 1824 } 1825 1826 MatPreallocateBegin(comm, pn, pn, dnz, onz); 1827 PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt)); 1828 for (i = 0; i < pn; i++) { 1829 /* add C_loc into Cmpi */ 1830 nzi = c_loc->i[i + 1] - c_loc->i[i]; 1831 Jptr = c_loc->j + c_loc->i[i]; 1832 PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt)); 1833 1834 /* add received col data into lnk */ 1835 for (k = 0; k < nrecv; k++) { /* k-th received message */ 1836 if (i == *nextrow[k]) { /* i-th row */ 1837 nzi = *(nextci[k] + 1) - *nextci[k]; 1838 Jptr = buf_rj[k] + *nextci[k]; 1839 PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt)); 1840 nextrow[k]++; 1841 nextci[k]++; 1842 } 1843 } 1844 nzi = lnk[0]; 1845 1846 /* copy data into free space, then initialize lnk */ 1847 PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt)); 1848 PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz)); 1849 } 1850 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 1851 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1852 PetscCall(PetscFreeSpaceDestroy(free_space)); 1853 1854 /* local sizes and preallocation */ 1855 PetscCall(MatSetSizes(Cmpi, pn, pn, 
PETSC_DETERMINE, PETSC_DETERMINE)); 1856 if (P->cmap->bs > 0) { 1857 PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs)); 1858 PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs)); 1859 } 1860 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 1861 MatPreallocateEnd(dnz, onz); 1862 1863 /* members in merge */ 1864 PetscCall(PetscFree(id_r)); 1865 PetscCall(PetscFree(len_r)); 1866 PetscCall(PetscFree(buf_ri[0])); 1867 PetscCall(PetscFree(buf_ri)); 1868 PetscCall(PetscFree(buf_rj[0])); 1869 PetscCall(PetscFree(buf_rj)); 1870 PetscCall(PetscLayoutDestroy(&rowmap)); 1871 1872 PetscCall(PetscCalloc1(pN, &ptap->apa)); 1873 1874 /* attach the supporting struct to Cmpi for reuse */ 1875 Cmpi->product->data = ptap; 1876 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 1877 Cmpi->product->view = MatView_MPIAIJ_PtAP; 1878 1879 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1880 Cmpi->assembled = PETSC_FALSE; 1881 PetscFunctionReturn(0); 1882 } 1883 1884 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C) 1885 { 1886 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 1887 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 1888 Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq; 1889 Mat_APMPI *ptap; 1890 Mat AP_loc, C_loc, C_oth; 1891 PetscInt i, rstart, rend, cm, ncols, row; 1892 PetscInt *api, *apj, am = A->rmap->n, j, col, apnz; 1893 PetscScalar *apa; 1894 const PetscInt *cols; 1895 const PetscScalar *vals; 1896 1897 PetscFunctionBegin; 1898 MatCheckProduct(C, 3); 1899 ptap = (Mat_APMPI *)C->product->data; 1900 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 1901 PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. 
Do not call MatProductClear()"); 1902 1903 PetscCall(MatZeroEntries(C)); 1904 /* 1) get R = Pd^T,Ro = Po^T */ 1905 if (ptap->reuse == MAT_REUSE_MATRIX) { 1906 PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd)); 1907 PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro)); 1908 } 1909 1910 /* 2) get AP_loc */ 1911 AP_loc = ptap->AP_loc; 1912 ap = (Mat_SeqAIJ *)AP_loc->data; 1913 1914 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 1915 /*-----------------------------------------------------*/ 1916 if (ptap->reuse == MAT_REUSE_MATRIX) { 1917 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 1918 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 1919 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 1920 } 1921 1922 /* 2-2) compute numeric A_loc*P - dominating part */ 1923 /* ---------------------------------------------- */ 1924 /* get data from symbolic products */ 1925 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 1926 if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 1927 apa = ptap->apa; 1928 api = ap->i; 1929 apj = ap->j; 1930 for (i = 0; i < am; i++) { 1931 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 1932 AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa); 1933 apnz = api[i + 1] - api[i]; 1934 for (j = 0; j < apnz; j++) { 1935 col = apj[j + api[i]]; 1936 ap->a[j + ap->i[i]] = apa[col]; 1937 apa[col] = 0.0; 1938 } 1939 } 1940 /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. 
*/ 1941 PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc)); 1942 1943 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 1944 PetscCall(MatProductNumeric(ptap->C_loc)); 1945 PetscCall(MatProductNumeric(ptap->C_oth)); 1946 C_loc = ptap->C_loc; 1947 C_oth = ptap->C_oth; 1948 1949 /* add C_loc and Co to to C */ 1950 PetscCall(MatGetOwnershipRange(C, &rstart, &rend)); 1951 1952 /* C_loc -> C */ 1953 cm = C_loc->rmap->N; 1954 c_seq = (Mat_SeqAIJ *)C_loc->data; 1955 cols = c_seq->j; 1956 vals = c_seq->a; 1957 1958 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 1959 /* when there are no off-processor parts. */ 1960 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 1961 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 1962 /* a table, and other, more complex stuff has to be done. 
*/ 1963 if (C->assembled) { 1964 C->was_assembled = PETSC_TRUE; 1965 C->assembled = PETSC_FALSE; 1966 } 1967 if (C->was_assembled) { 1968 for (i = 0; i < cm; i++) { 1969 ncols = c_seq->i[i + 1] - c_seq->i[i]; 1970 row = rstart + i; 1971 PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 1972 cols += ncols; 1973 vals += ncols; 1974 } 1975 } else { 1976 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a)); 1977 } 1978 1979 /* Co -> C, off-processor part */ 1980 cm = C_oth->rmap->N; 1981 c_seq = (Mat_SeqAIJ *)C_oth->data; 1982 cols = c_seq->j; 1983 vals = c_seq->a; 1984 for (i = 0; i < cm; i++) { 1985 ncols = c_seq->i[i + 1] - c_seq->i[i]; 1986 row = p->garray[i]; 1987 PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 1988 cols += ncols; 1989 vals += ncols; 1990 } 1991 1992 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1993 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1994 1995 ptap->reuse = MAT_REUSE_MATRIX; 1996 PetscFunctionReturn(0); 1997 } 1998 1999 PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C) 2000 { 2001 Mat_Product *product = C->product; 2002 Mat A = product->A, P = product->B; 2003 MatProductAlgorithm alg = product->alg; 2004 PetscReal fill = product->fill; 2005 PetscBool flg; 2006 2007 PetscFunctionBegin; 2008 /* scalable: do R=P^T locally, then C=R*A*P */ 2009 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 2010 if (flg) { 2011 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C)); 2012 C->ops->productnumeric = MatProductNumeric_PtAP; 2013 goto next; 2014 } 2015 2016 /* nonscalable: do R=P^T locally, then C=R*A*P */ 2017 PetscCall(PetscStrcmp(alg, "nonscalable", &flg)); 2018 if (flg) { 2019 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C)); 2020 goto next; 2021 } 2022 2023 /* allatonce */ 2024 PetscCall(PetscStrcmp(alg, "allatonce", &flg)); 2025 if (flg) { 2026 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, 
C)); 2027 goto next; 2028 } 2029 2030 /* allatonce_merged */ 2031 PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg)); 2032 if (flg) { 2033 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C)); 2034 goto next; 2035 } 2036 2037 /* backend general code */ 2038 PetscCall(PetscStrcmp(alg, "backend", &flg)); 2039 if (flg) { 2040 PetscCall(MatProductSymbolic_MPIAIJBACKEND(C)); 2041 PetscFunctionReturn(0); 2042 } 2043 2044 /* hypre */ 2045 #if defined(PETSC_HAVE_HYPRE) 2046 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 2047 if (flg) { 2048 PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C)); 2049 C->ops->productnumeric = MatProductNumeric_PtAP; 2050 PetscFunctionReturn(0); 2051 } 2052 #endif 2053 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 2054 2055 next: 2056 C->ops->productnumeric = MatProductNumeric_PtAP; 2057 PetscFunctionReturn(0); 2058 } 2059