1 2 /* 3 Defines projective product routines where A is a MPIAIJ matrix 4 C = P^T * A * P 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <../src/mat/impls/aij/mpi/mpiaij.h> 10 #include <petscbt.h> 11 #include <petsctime.h> 12 #include <petsc/private/hashmapiv.h> 13 #include <petsc/private/hashseti.h> 14 #include <petscsf.h> 15 16 PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer) { 17 PetscBool iascii; 18 PetscViewerFormat format; 19 Mat_APMPI *ptap; 20 21 PetscFunctionBegin; 22 MatCheckProduct(A, 1); 23 ptap = (Mat_APMPI *)A->product->data; 24 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii)); 25 if (iascii) { 26 PetscCall(PetscViewerGetFormat(viewer, &format)); 27 if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) { 28 if (ptap->algType == 0) { 29 PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n")); 30 } else if (ptap->algType == 1) { 31 PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n")); 32 } else if (ptap->algType == 2) { 33 PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n")); 34 } else if (ptap->algType == 3) { 35 PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n")); 36 } 37 } 38 } 39 PetscFunctionReturn(0); 40 } 41 42 PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data) { 43 Mat_APMPI *ptap = (Mat_APMPI *)data; 44 Mat_Merge_SeqsToMPI *merge; 45 46 PetscFunctionBegin; 47 PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r)); 48 PetscCall(PetscFree(ptap->bufa)); 49 PetscCall(MatDestroy(&ptap->P_loc)); 50 PetscCall(MatDestroy(&ptap->P_oth)); 51 PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */ 52 PetscCall(MatDestroy(&ptap->Rd)); 53 PetscCall(MatDestroy(&ptap->Ro)); 54 if (ptap->AP_loc) { /* used by alg_rap */ 55 Mat_SeqAIJ *ap = (Mat_SeqAIJ 
*)(ptap->AP_loc)->data; 56 PetscCall(PetscFree(ap->i)); 57 PetscCall(PetscFree2(ap->j, ap->a)); 58 PetscCall(MatDestroy(&ptap->AP_loc)); 59 } else { /* used by alg_ptap */ 60 PetscCall(PetscFree(ptap->api)); 61 PetscCall(PetscFree(ptap->apj)); 62 } 63 PetscCall(MatDestroy(&ptap->C_loc)); 64 PetscCall(MatDestroy(&ptap->C_oth)); 65 if (ptap->apa) PetscCall(PetscFree(ptap->apa)); 66 67 PetscCall(MatDestroy(&ptap->Pt)); 68 69 merge = ptap->merge; 70 if (merge) { /* used by alg_ptap */ 71 PetscCall(PetscFree(merge->id_r)); 72 PetscCall(PetscFree(merge->len_s)); 73 PetscCall(PetscFree(merge->len_r)); 74 PetscCall(PetscFree(merge->bi)); 75 PetscCall(PetscFree(merge->bj)); 76 PetscCall(PetscFree(merge->buf_ri[0])); 77 PetscCall(PetscFree(merge->buf_ri)); 78 PetscCall(PetscFree(merge->buf_rj[0])); 79 PetscCall(PetscFree(merge->buf_rj)); 80 PetscCall(PetscFree(merge->coi)); 81 PetscCall(PetscFree(merge->coj)); 82 PetscCall(PetscFree(merge->owners_co)); 83 PetscCall(PetscLayoutDestroy(&merge->rowmap)); 84 PetscCall(PetscFree(ptap->merge)); 85 } 86 PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog)); 87 88 PetscCall(PetscSFDestroy(&ptap->sf)); 89 PetscCall(PetscFree(ptap->c_othi)); 90 PetscCall(PetscFree(ptap->c_rmti)); 91 PetscCall(PetscFree(ptap)); 92 PetscFunctionReturn(0); 93 } 94 95 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C) { 96 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 97 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 98 Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq; 99 Mat_APMPI *ptap; 100 Mat AP_loc, C_loc, C_oth; 101 PetscInt i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout; 102 PetscScalar *apa; 103 const PetscInt *cols; 104 const PetscScalar *vals; 105 106 PetscFunctionBegin; 107 MatCheckProduct(C, 3); 108 ptap = (Mat_APMPI *)C->product->data; 109 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. 
Missing data"); 110 PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 111 112 PetscCall(MatZeroEntries(C)); 113 114 /* 1) get R = Pd^T,Ro = Po^T */ 115 if (ptap->reuse == MAT_REUSE_MATRIX) { 116 PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd)); 117 PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro)); 118 } 119 120 /* 2) get AP_loc */ 121 AP_loc = ptap->AP_loc; 122 ap = (Mat_SeqAIJ *)AP_loc->data; 123 124 /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 125 /*-----------------------------------------------------*/ 126 if (ptap->reuse == MAT_REUSE_MATRIX) { 127 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 128 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 129 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 130 } 131 132 /* 2-2) compute numeric A_loc*P - dominating part */ 133 /* ---------------------------------------------- */ 134 /* get data from symbolic products */ 135 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 136 if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 137 138 api = ap->i; 139 apj = ap->j; 140 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj)); 141 for (i = 0; i < am; i++) { 142 /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */ 143 apnz = api[i + 1] - api[i]; 144 apa = ap->a + api[i]; 145 PetscCall(PetscArrayzero(apa, apnz)); 146 AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa); 147 } 148 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj)); 149 PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout); 150 151 /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */ 152 /* Always use scalable version since 
we are in the MPI scalable version */ 153 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc)); 154 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth)); 155 156 C_loc = ptap->C_loc; 157 C_oth = ptap->C_oth; 158 159 /* add C_loc and Co to to C */ 160 PetscCall(MatGetOwnershipRange(C, &rstart, &rend)); 161 162 /* C_loc -> C */ 163 cm = C_loc->rmap->N; 164 c_seq = (Mat_SeqAIJ *)C_loc->data; 165 cols = c_seq->j; 166 vals = c_seq->a; 167 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j)); 168 169 /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */ 170 /* when there are no off-processor parts. */ 171 /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */ 172 /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */ 173 /* a table, and other, more complex stuff has to be done. 
*/ 174 if (C->assembled) { 175 C->was_assembled = PETSC_TRUE; 176 C->assembled = PETSC_FALSE; 177 } 178 if (C->was_assembled) { 179 for (i = 0; i < cm; i++) { 180 ncols = c_seq->i[i + 1] - c_seq->i[i]; 181 row = rstart + i; 182 PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 183 cols += ncols; 184 vals += ncols; 185 } 186 } else { 187 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a)); 188 } 189 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j)); 190 PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout); 191 192 /* Co -> C, off-processor part */ 193 cm = C_oth->rmap->N; 194 c_seq = (Mat_SeqAIJ *)C_oth->data; 195 cols = c_seq->j; 196 vals = c_seq->a; 197 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j)); 198 for (i = 0; i < cm; i++) { 199 ncols = c_seq->i[i + 1] - c_seq->i[i]; 200 row = p->garray[i]; 201 PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES)); 202 cols += ncols; 203 vals += ncols; 204 } 205 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 206 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 207 208 ptap->reuse = MAT_REUSE_MATRIX; 209 210 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j)); 211 PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout); 212 PetscFunctionReturn(0); 213 } 214 215 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi) { 216 Mat_APMPI *ptap; 217 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 218 MPI_Comm comm; 219 PetscMPIInt size, rank; 220 Mat P_loc, P_oth; 221 PetscFreeSpaceList 
free_space = NULL, current_space = NULL; 222 PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n; 223 PetscInt *lnk, i, k, pnz, row, nsend; 224 PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv; 225 PETSC_UNUSED PetscMPIInt icompleted = 0; 226 PetscInt **buf_rj, **buf_ri, **buf_ri_k; 227 const PetscInt *owners; 228 PetscInt len, proc, *dnz, *onz, nzi, nspacedouble; 229 PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci; 230 MPI_Request *swaits, *rwaits; 231 MPI_Status *sstatus, rstatus; 232 PetscLayout rowmap; 233 PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 234 PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */ 235 PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout; 236 Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth; 237 PetscScalar *apv; 238 PetscTable ta; 239 MatType mtype; 240 const char *prefix; 241 #if defined(PETSC_USE_INFO) 242 PetscReal apfill; 243 #endif 244 245 PetscFunctionBegin; 246 MatCheckProduct(Cmpi, 4); 247 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 248 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 249 PetscCallMPI(MPI_Comm_size(comm, &size)); 250 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 251 252 if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data; 253 254 /* create symbolic parallel matrix Cmpi */ 255 PetscCall(MatGetType(A, &mtype)); 256 PetscCall(MatSetType(Cmpi, mtype)); 257 258 /* create struct Mat_APMPI and attached it to C later */ 259 PetscCall(PetscNew(&ptap)); 260 ptap->reuse = MAT_INITIAL_MATRIX; 261 ptap->algType = 0; 262 263 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 264 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth)); 265 /* get 
P_loc by taking all local rows of P */ 266 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc)); 267 268 ptap->P_loc = P_loc; 269 ptap->P_oth = P_oth; 270 271 /* (0) compute Rd = Pd^T, Ro = Po^T */ 272 /* --------------------------------- */ 273 PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd)); 274 PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro)); 275 276 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 277 /* ----------------------------------------------------------------- */ 278 p_loc = (Mat_SeqAIJ *)P_loc->data; 279 if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data; 280 281 /* create and initialize a linked list */ 282 PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */ 283 MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta); 284 MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta); 285 PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */ 286 287 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 288 289 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 290 if (ao) { 291 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space)); 292 } else { 293 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space)); 294 } 295 current_space = free_space; 296 nspacedouble = 0; 297 298 PetscCall(PetscMalloc1(am + 1, &api)); 299 api[0] = 0; 300 for (i = 0; i < am; i++) { 301 /* diagonal portion: Ad[i,:]*P */ 302 ai = ad->i; 303 pi = p_loc->i; 304 nzi = ai[i + 1] - ai[i]; 305 aj = ad->j + ai[i]; 306 for (j = 0; j < nzi; j++) { 307 row = aj[j]; 308 pnz = pi[row + 1] - pi[row]; 309 Jptr = p_loc->j + pi[row]; 310 /* add non-zero cols of P into the sorted linked list lnk */ 311 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 312 } 313 /* off-diagonal portion: Ao[i,:]*P */ 314 if (ao) { 315 ai = ao->i; 316 pi = 
p_oth->i; 317 nzi = ai[i + 1] - ai[i]; 318 aj = ao->j + ai[i]; 319 for (j = 0; j < nzi; j++) { 320 row = aj[j]; 321 pnz = pi[row + 1] - pi[row]; 322 Jptr = p_oth->j + pi[row]; 323 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 324 } 325 } 326 apnz = lnk[0]; 327 api[i + 1] = api[i] + apnz; 328 329 /* if free space is not available, double the total space in the list */ 330 if (current_space->local_remaining < apnz) { 331 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 332 nspacedouble++; 333 } 334 335 /* Copy data into free space, then initialize lnk */ 336 PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk)); 337 338 current_space->array += apnz; 339 current_space->local_used += apnz; 340 current_space->local_remaining -= apnz; 341 } 342 /* Allocate space for apj and apv, initialize apj, and */ 343 /* destroy list of free space and other temporary array(s) */ 344 PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv)); 345 PetscCall(PetscFreeSpaceContiguous(&free_space, apj)); 346 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 347 348 /* Create AP_loc for reuse */ 349 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc)); 350 PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog)); 351 352 #if defined(PETSC_USE_INFO) 353 if (ao) { 354 apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1); 355 } else { 356 apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1); 357 } 358 ptap->AP_loc->info.mallocs = nspacedouble; 359 ptap->AP_loc->info.fill_ratio_given = fill; 360 ptap->AP_loc->info.fill_ratio_needed = apfill; 361 362 if (api[am]) { 363 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill)); 364 PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc 
performance.;\n", (double)apfill)); 365 } else { 366 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n")); 367 } 368 #endif 369 370 /* (2-1) compute symbolic Co = Ro*AP_loc */ 371 /* -------------------------------------- */ 372 PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth)); 373 PetscCall(MatGetOptionsPrefix(A, &prefix)); 374 PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix)); 375 PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_")); 376 377 PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB)); 378 PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted")); 379 PetscCall(MatProductSetFill(ptap->C_oth, fill)); 380 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 381 PetscCall(MatProductSymbolic(ptap->C_oth)); 382 383 /* (3) send coj of C_oth to other processors */ 384 /* ------------------------------------------ */ 385 /* determine row ownership */ 386 PetscCall(PetscLayoutCreate(comm, &rowmap)); 387 PetscCall(PetscLayoutSetLocalSize(rowmap, pn)); 388 PetscCall(PetscLayoutSetBlockSize(rowmap, 1)); 389 PetscCall(PetscLayoutSetUp(rowmap)); 390 PetscCall(PetscLayoutGetRanges(rowmap, &owners)); 391 392 /* determine the number of messages to send, their lengths */ 393 PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co)); 394 PetscCall(PetscArrayzero(len_s, size)); 395 PetscCall(PetscArrayzero(len_si, size)); 396 397 c_oth = (Mat_SeqAIJ *)ptap->C_oth->data; 398 coi = c_oth->i; 399 coj = c_oth->j; 400 con = ptap->C_oth->rmap->n; 401 proc = 0; 402 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj)); 403 for (i = 0; i < con; i++) { 404 while (prmap[i] >= owners[proc + 1]) proc++; 405 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 406 len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 407 } 408 409 len = 0; /* max length of buf_si[], see (4) */ 410 owners_co[0] = 0; 411 nsend = 0; 412 for (proc = 0; 
proc < size; proc++) { 413 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 414 if (len_s[proc]) { 415 nsend++; 416 len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 417 len += len_si[proc]; 418 } 419 } 420 421 /* determine the number and length of messages to receive for coi and coj */ 422 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv)); 423 PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri)); 424 425 /* post the Irecv and Isend of coj */ 426 PetscCall(PetscCommGetNewTag(comm, &tagj)); 427 PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits)); 428 PetscCall(PetscMalloc1(nsend + 1, &swaits)); 429 for (proc = 0, k = 0; proc < size; proc++) { 430 if (!len_s[proc]) continue; 431 i = owners_co[proc]; 432 PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 433 k++; 434 } 435 436 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 437 /* ---------------------------------------- */ 438 PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc)); 439 PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB)); 440 PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default")); 441 PetscCall(MatProductSetFill(ptap->C_loc, fill)); 442 443 PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix)); 444 PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_")); 445 446 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 447 PetscCall(MatProductSymbolic(ptap->C_loc)); 448 449 c_loc = (Mat_SeqAIJ *)ptap->C_loc->data; 450 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j)); 451 452 /* receives coj are complete */ 453 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 454 PetscCall(PetscFree(rwaits)); 455 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 456 457 /* add received column indices into ta to update Crmax */ 
458 for (k = 0; k < nrecv; k++) { /* k-th received message */ 459 Jptr = buf_rj[k]; 460 for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES)); 461 } 462 PetscCall(PetscTableGetCount(ta, &Crmax)); 463 PetscCall(PetscTableDestroy(&ta)); 464 465 /* (4) send and recv coi */ 466 /*-----------------------*/ 467 PetscCall(PetscCommGetNewTag(comm, &tagi)); 468 PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits)); 469 PetscCall(PetscMalloc1(len + 1, &buf_s)); 470 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 471 for (proc = 0, k = 0; proc < size; proc++) { 472 if (!len_s[proc]) continue; 473 /* form outgoing message for i-structure: 474 buf_si[0]: nrows to be sent 475 [1:nrows]: row index (global) 476 [nrows+1:2*nrows+1]: i-structure index 477 */ 478 /*-------------------------------------------*/ 479 nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */ 480 buf_si_i = buf_si + nrows + 1; 481 buf_si[0] = nrows; 482 buf_si_i[0] = 0; 483 nrows = 0; 484 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 485 nzi = coi[i + 1] - coi[i]; 486 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 487 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 488 nrows++; 489 } 490 PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 491 k++; 492 buf_si += len_si[proc]; 493 } 494 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 495 PetscCall(PetscFree(rwaits)); 496 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 497 498 PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co)); 499 PetscCall(PetscFree(len_ri)); 500 PetscCall(PetscFree(swaits)); 501 PetscCall(PetscFree(buf_s)); 502 503 /* (5) compute the local portion of Cmpi */ 504 /* ------------------------------------------ */ 505 /* set initial free space to be Crmax, sufficient for holding nozeros in each 
row of Cmpi */ 506 PetscCall(PetscFreeSpaceGet(Crmax, &free_space)); 507 current_space = free_space; 508 509 PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci)); 510 for (k = 0; k < nrecv; k++) { 511 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 512 nrows = *buf_ri_k[k]; 513 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 514 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 515 } 516 517 MatPreallocateBegin(comm, pn, pn, dnz, onz); 518 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 519 for (i = 0; i < pn; i++) { 520 /* add C_loc into Cmpi */ 521 nzi = c_loc->i[i + 1] - c_loc->i[i]; 522 Jptr = c_loc->j + c_loc->i[i]; 523 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 524 525 /* add received col data into lnk */ 526 for (k = 0; k < nrecv; k++) { /* k-th received message */ 527 if (i == *nextrow[k]) { /* i-th row */ 528 nzi = *(nextci[k] + 1) - *nextci[k]; 529 Jptr = buf_rj[k] + *nextci[k]; 530 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 531 nextrow[k]++; 532 nextci[k]++; 533 } 534 } 535 nzi = lnk[0]; 536 537 /* copy data into free space, then initialize lnk */ 538 PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk)); 539 PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz)); 540 } 541 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 542 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 543 PetscCall(PetscFreeSpaceDestroy(free_space)); 544 545 /* local sizes and preallocation */ 546 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 547 if (P->cmap->bs > 0) { 548 PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs)); 549 PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs)); 550 } 551 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 552 MatPreallocateEnd(dnz, onz); 553 554 /* members in merge */ 555 
PetscCall(PetscFree(id_r)); 556 PetscCall(PetscFree(len_r)); 557 PetscCall(PetscFree(buf_ri[0])); 558 PetscCall(PetscFree(buf_ri)); 559 PetscCall(PetscFree(buf_rj[0])); 560 PetscCall(PetscFree(buf_rj)); 561 PetscCall(PetscLayoutDestroy(&rowmap)); 562 563 nout = 0; 564 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j)); 565 PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout); 566 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j)); 567 PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout); 568 569 /* attach the supporting struct to Cmpi for reuse */ 570 Cmpi->product->data = ptap; 571 Cmpi->product->view = MatView_MPIAIJ_PtAP; 572 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 573 574 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 575 Cmpi->assembled = PETSC_FALSE; 576 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 577 PetscFunctionReturn(0); 578 } 579 580 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht) { 581 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 582 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 583 PetscInt *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k; 584 PetscInt pcstart, pcend, column, offset; 585 586 PetscFunctionBegin; 587 pcstart = P->cmap->rstart; 588 pcstart *= dof; 589 pcend = P->cmap->rend; 590 pcend *= 
dof; 591 /* diagonal portion: Ad[i,:]*P */ 592 ai = ad->i; 593 nzi = ai[i + 1] - ai[i]; 594 aj = ad->j + ai[i]; 595 for (j = 0; j < nzi; j++) { 596 row = aj[j]; 597 offset = row % dof; 598 row /= dof; 599 nzpi = pd->i[row + 1] - pd->i[row]; 600 pj = pd->j + pd->i[row]; 601 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart)); 602 } 603 /* off diag P */ 604 for (j = 0; j < nzi; j++) { 605 row = aj[j]; 606 offset = row % dof; 607 row /= dof; 608 nzpi = po->i[row + 1] - po->i[row]; 609 pj = po->j + po->i[row]; 610 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset)); 611 } 612 613 /* off diagonal part: Ao[i, :]*P_oth */ 614 if (ao) { 615 ai = ao->i; 616 pi = p_oth->i; 617 nzi = ai[i + 1] - ai[i]; 618 aj = ao->j + ai[i]; 619 for (j = 0; j < nzi; j++) { 620 row = aj[j]; 621 offset = a->garray[row] % dof; 622 row = map[row]; 623 pnz = pi[row + 1] - pi[row]; 624 p_othcols = p_oth->j + pi[row]; 625 for (col = 0; col < pnz; col++) { 626 column = p_othcols[col] * dof + offset; 627 if (column >= pcstart && column < pcend) { 628 PetscCall(PetscHSetIAdd(dht, column)); 629 } else { 630 PetscCall(PetscHSetIAdd(oht, column)); 631 } 632 } 633 } 634 } /* end if (ao) */ 635 PetscFunctionReturn(0); 636 } 637 638 static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap) { 639 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 640 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 641 PetscInt *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset; 642 PetscScalar ra, *aa, *pa; 643 644 PetscFunctionBegin; 645 pcstart = P->cmap->rstart; 646 pcstart *= dof; 647 648 /* diagonal portion: Ad[i,:]*P */ 649 ai = ad->i; 650 nzi = ai[i + 1] - ai[i]; 651 aj = 
ad->j + ai[i]; 652 aa = ad->a + ai[i]; 653 for (j = 0; j < nzi; j++) { 654 ra = aa[j]; 655 row = aj[j]; 656 offset = row % dof; 657 row /= dof; 658 nzpi = pd->i[row + 1] - pd->i[row]; 659 pj = pd->j + pd->i[row]; 660 pa = pd->a + pd->i[row]; 661 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k])); 662 PetscCall(PetscLogFlops(2.0 * nzpi)); 663 } 664 for (j = 0; j < nzi; j++) { 665 ra = aa[j]; 666 row = aj[j]; 667 offset = row % dof; 668 row /= dof; 669 nzpi = po->i[row + 1] - po->i[row]; 670 pj = po->j + po->i[row]; 671 pa = po->a + po->i[row]; 672 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k])); 673 PetscCall(PetscLogFlops(2.0 * nzpi)); 674 } 675 676 /* off diagonal part: Ao[i, :]*P_oth */ 677 if (ao) { 678 ai = ao->i; 679 pi = p_oth->i; 680 nzi = ai[i + 1] - ai[i]; 681 aj = ao->j + ai[i]; 682 aa = ao->a + ai[i]; 683 for (j = 0; j < nzi; j++) { 684 row = aj[j]; 685 offset = a->garray[row] % dof; 686 row = map[row]; 687 ra = aa[j]; 688 pnz = pi[row + 1] - pi[row]; 689 p_othcols = p_oth->j + pi[row]; 690 pa = p_oth->a + pi[row]; 691 for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col])); 692 PetscCall(PetscLogFlops(2.0 * pnz)); 693 } 694 } /* end if (ao) */ 695 696 PetscFunctionReturn(0); 697 } 698 699 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *); 700 701 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, Mat C) { 702 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data; 703 Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data; 704 Mat_APMPI *ptap; 705 PetscHMapIV hmap; 706 PetscInt i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc; 707 PetscScalar *c_rmta, *c_otha, *poa, 
*pda, *apvalues, *apvaluestmp, *c_rmtaa; 708 PetscInt offset, ii, pocol; 709 const PetscInt *mappingindices; 710 IS map; 711 712 PetscFunctionBegin; 713 MatCheckProduct(C, 4); 714 ptap = (Mat_APMPI *)C->product->data; 715 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 716 PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 717 718 PetscCall(MatZeroEntries(C)); 719 720 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 721 /*-----------------------------------------------------*/ 722 if (ptap->reuse == MAT_REUSE_MATRIX) { 723 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 724 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth)); 725 } 726 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 727 728 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 729 pon *= dof; 730 PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta)); 731 PetscCall(MatGetLocalSize(A, &am, NULL)); 732 cmaxr = 0; 733 for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); 734 PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc)); 735 PetscCall(PetscHMapIVCreate(&hmap)); 736 PetscCall(PetscHMapIVResize(hmap, cmaxr)); 737 PetscCall(ISGetIndices(map, &mappingindices)); 738 for (i = 0; i < am && pon; i++) { 739 PetscCall(PetscHMapIVClear(hmap)); 740 offset = i % dof; 741 ii = i / dof; 742 nzi = po->i[ii + 1] - po->i[ii]; 743 if (!nzi) continue; 744 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 745 voff = 0; 746 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 747 if (!voff) continue; 748 749 /* Form C(ii, :) */ 750 poj = po->j + po->i[ii]; 751 poa = po->a + 
po->i[ii]; 752 for (j = 0; j < nzi; j++) { 753 pocol = poj[j] * dof + offset; 754 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 755 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 756 for (jj = 0; jj < voff; jj++) { 757 apvaluestmp[jj] = apvalues[jj] * poa[j]; 758 /*If the row is empty */ 759 if (!c_rmtc[pocol]) { 760 c_rmtjj[jj] = apindices[jj]; 761 c_rmtaa[jj] = apvaluestmp[jj]; 762 c_rmtc[pocol]++; 763 } else { 764 PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc)); 765 if (loc >= 0) { /* hit */ 766 c_rmtaa[loc] += apvaluestmp[jj]; 767 PetscCall(PetscLogFlops(1.0)); 768 } else { /* new element */ 769 loc = -(loc + 1); 770 /* Move data backward */ 771 for (kk = c_rmtc[pocol]; kk > loc; kk--) { 772 c_rmtjj[kk] = c_rmtjj[kk - 1]; 773 c_rmtaa[kk] = c_rmtaa[kk - 1]; 774 } /* End kk */ 775 c_rmtjj[loc] = apindices[jj]; 776 c_rmtaa[loc] = apvaluestmp[jj]; 777 c_rmtc[pocol]++; 778 } 779 } 780 PetscCall(PetscLogFlops(voff)); 781 } /* End jj */ 782 } /* End j */ 783 } /* End i */ 784 785 PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc)); 786 PetscCall(PetscHMapIVDestroy(&hmap)); 787 788 PetscCall(MatGetLocalSize(P, NULL, &pn)); 789 pn *= dof; 790 PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha)); 791 792 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 793 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 794 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 795 pcstart = pcstart * dof; 796 pcend = pcend * dof; 797 cd = (Mat_SeqAIJ *)(c->A)->data; 798 co = (Mat_SeqAIJ *)(c->B)->data; 799 800 cmaxr = 0; 801 for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); 802 PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ)); 803 PetscCall(PetscHMapIVCreate(&hmap)); 804 PetscCall(PetscHMapIVResize(hmap, cmaxr)); 805 for (i = 0; i < am && pn; i++) { 806 
PetscCall(PetscHMapIVClear(hmap)); 807 offset = i % dof; 808 ii = i / dof; 809 nzi = pd->i[ii + 1] - pd->i[ii]; 810 if (!nzi) continue; 811 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 812 voff = 0; 813 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 814 if (!voff) continue; 815 /* Form C(ii, :) */ 816 pdj = pd->j + pd->i[ii]; 817 pda = pd->a + pd->i[ii]; 818 for (j = 0; j < nzi; j++) { 819 row = pcstart + pdj[j] * dof + offset; 820 for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; 821 PetscCall(PetscLogFlops(voff)); 822 PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES)); 823 } 824 } 825 PetscCall(ISRestoreIndices(map, &mappingindices)); 826 PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend)); 827 PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ)); 828 PetscCall(PetscHMapIVDestroy(&hmap)); 829 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 830 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 831 PetscCall(PetscFree2(c_rmtj, c_rmta)); 832 833 /* Add contributions from remote */ 834 for (i = 0; i < pn; i++) { 835 row = i + pcstart; 836 PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES)); 837 } 838 PetscCall(PetscFree2(c_othj, c_otha)); 839 840 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 841 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 842 843 ptap->reuse = MAT_REUSE_MATRIX; 844 PetscFunctionReturn(0); 845 } 846 847 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C) { 848 PetscFunctionBegin; 849 850 PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C)); 851 PetscFunctionReturn(0); 852 } 853 854 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C) { 855 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, 
*c = (Mat_MPIAIJ *)C->data; 856 Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data; 857 Mat_APMPI *ptap; 858 PetscHMapIV hmap; 859 PetscInt i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc; 860 PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa; 861 PetscInt offset, ii, pocol; 862 const PetscInt *mappingindices; 863 IS map; 864 865 PetscFunctionBegin; 866 MatCheckProduct(C, 4); 867 ptap = (Mat_APMPI *)C->product->data; 868 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 869 PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 870 871 PetscCall(MatZeroEntries(C)); 872 873 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 874 /*-----------------------------------------------------*/ 875 if (ptap->reuse == MAT_REUSE_MATRIX) { 876 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 877 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth)); 878 } 879 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 880 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 881 pon *= dof; 882 PetscCall(MatGetLocalSize(P, NULL, &pn)); 883 pn *= dof; 884 885 PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta)); 886 PetscCall(MatGetLocalSize(A, &am, NULL)); 887 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 888 pcstart *= dof; 889 pcend *= dof; 890 cmaxr = 0; 891 for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); 892 cd = (Mat_SeqAIJ *)(c->A)->data; 893 co = (Mat_SeqAIJ *)(c->B)->data; 894 for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); 895 
PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc)); 896 PetscCall(PetscHMapIVCreate(&hmap)); 897 PetscCall(PetscHMapIVResize(hmap, cmaxr)); 898 PetscCall(ISGetIndices(map, &mappingindices)); 899 for (i = 0; i < am && (pon || pn); i++) { 900 PetscCall(PetscHMapIVClear(hmap)); 901 offset = i % dof; 902 ii = i / dof; 903 nzi = po->i[ii + 1] - po->i[ii]; 904 dnzi = pd->i[ii + 1] - pd->i[ii]; 905 if (!nzi && !dnzi) continue; 906 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 907 voff = 0; 908 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 909 if (!voff) continue; 910 911 /* Form remote C(ii, :) */ 912 poj = po->j + po->i[ii]; 913 poa = po->a + po->i[ii]; 914 for (j = 0; j < nzi; j++) { 915 pocol = poj[j] * dof + offset; 916 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 917 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 918 for (jj = 0; jj < voff; jj++) { 919 apvaluestmp[jj] = apvalues[jj] * poa[j]; 920 /*If the row is empty */ 921 if (!c_rmtc[pocol]) { 922 c_rmtjj[jj] = apindices[jj]; 923 c_rmtaa[jj] = apvaluestmp[jj]; 924 c_rmtc[pocol]++; 925 } else { 926 PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc)); 927 if (loc >= 0) { /* hit */ 928 c_rmtaa[loc] += apvaluestmp[jj]; 929 PetscCall(PetscLogFlops(1.0)); 930 } else { /* new element */ 931 loc = -(loc + 1); 932 /* Move data backward */ 933 for (kk = c_rmtc[pocol]; kk > loc; kk--) { 934 c_rmtjj[kk] = c_rmtjj[kk - 1]; 935 c_rmtaa[kk] = c_rmtaa[kk - 1]; 936 } /* End kk */ 937 c_rmtjj[loc] = apindices[jj]; 938 c_rmtaa[loc] = apvaluestmp[jj]; 939 c_rmtc[pocol]++; 940 } 941 } 942 } /* End jj */ 943 PetscCall(PetscLogFlops(voff)); 944 } /* End j */ 945 946 /* Form local C(ii, :) */ 947 pdj = pd->j + pd->i[ii]; 948 pda = pd->a + pd->i[ii]; 949 for (j = 0; j < dnzi; j++) { 950 row = pcstart + pdj[j] * dof + offset; 951 for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End kk */ 952 
PetscCall(PetscLogFlops(voff)); 953 PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES)); 954 } /* End j */ 955 } /* End i */ 956 957 PetscCall(ISRestoreIndices(map, &mappingindices)); 958 PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc)); 959 PetscCall(PetscHMapIVDestroy(&hmap)); 960 PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha)); 961 962 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 963 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 964 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 965 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 966 PetscCall(PetscFree2(c_rmtj, c_rmta)); 967 968 /* Add contributions from remote */ 969 for (i = 0; i < pn; i++) { 970 row = i + pcstart; 971 PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], c_othj + ptap->c_othi[i], c_otha + ptap->c_othi[i], ADD_VALUES)); 972 } 973 PetscCall(PetscFree2(c_othj, c_otha)); 974 975 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 976 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 977 978 ptap->reuse = MAT_REUSE_MATRIX; 979 PetscFunctionReturn(0); 980 } 981 982 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C) { 983 PetscFunctionBegin; 984 985 PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C)); 986 PetscFunctionReturn(0); 987 } 988 989 /* TODO: move algorithm selection to MatProductSetFromOptions */ 990 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) { 991 Mat_APMPI *ptap; 992 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data; 993 MPI_Comm comm; 994 Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 995 MatType mtype; 996 PetscSF sf; 997 PetscSFNode *iremote; 998 PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves; 999 const PetscInt 
*rootdegrees; 1000 PetscHSetI ht, oht, *hta, *hto; 1001 PetscInt pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets; 1002 PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj; 1003 PetscInt nalg = 2, alg = 0, offset, ii; 1004 PetscMPIInt owner; 1005 const PetscInt *mappingindices; 1006 PetscBool flg; 1007 const char *algTypes[2] = {"overlapping", "merged"}; 1008 IS map; 1009 1010 PetscFunctionBegin; 1011 MatCheckProduct(Cmpi, 5); 1012 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1013 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1014 1015 /* Create symbolic parallel matrix Cmpi */ 1016 PetscCall(MatGetLocalSize(P, NULL, &pn)); 1017 pn *= dof; 1018 PetscCall(MatGetType(A, &mtype)); 1019 PetscCall(MatSetType(Cmpi, mtype)); 1020 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1021 1022 PetscCall(PetscNew(&ptap)); 1023 ptap->reuse = MAT_INITIAL_MATRIX; 1024 ptap->algType = 2; 1025 1026 /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1027 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth)); 1028 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 1029 /* This equals to the number of offdiag columns in P */ 1030 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 1031 pon *= dof; 1032 /* offsets */ 1033 PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti)); 1034 /* The number of columns we will send to remote ranks */ 1035 PetscCall(PetscMalloc1(pon, &c_rmtc)); 1036 PetscCall(PetscMalloc1(pon, &hta)); 1037 for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i])); 1038 PetscCall(MatGetLocalSize(A, &am, NULL)); 1039 PetscCall(MatGetOwnershipRange(A, &arstart, &arend)); 1040 /* Create hash table to merge all columns for C(i, :) */ 1041 PetscCall(PetscHSetICreate(&ht)); 1042 1043 
PetscCall(ISGetIndices(map, &mappingindices)); 1044 ptap->c_rmti[0] = 0; 1045 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1046 for (i = 0; i < am && pon; i++) { 1047 /* Form one row of AP */ 1048 PetscCall(PetscHSetIClear(ht)); 1049 offset = i % dof; 1050 ii = i / dof; 1051 /* If the off diag is empty, we should not do any calculation */ 1052 nzi = po->i[ii + 1] - po->i[ii]; 1053 if (!nzi) continue; 1054 1055 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht)); 1056 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1057 /* If AP is empty, just continue */ 1058 if (!htsize) continue; 1059 /* Form C(ii, :) */ 1060 poj = po->j + po->i[ii]; 1061 for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht)); 1062 } 1063 1064 for (i = 0; i < pon; i++) { 1065 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1066 ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize; 1067 c_rmtc[i] = htsize; 1068 } 1069 1070 PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj)); 1071 1072 for (i = 0; i < pon; i++) { 1073 off = 0; 1074 PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i])); 1075 PetscCall(PetscHSetIDestroy(&hta[i])); 1076 } 1077 PetscCall(PetscFree(hta)); 1078 1079 PetscCall(PetscMalloc1(pon, &iremote)); 1080 for (i = 0; i < pon; i++) { 1081 owner = 0; 1082 lidx = 0; 1083 offset = i % dof; 1084 ii = i / dof; 1085 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx)); 1086 iremote[i].index = lidx * dof + offset; 1087 iremote[i].rank = owner; 1088 } 1089 1090 PetscCall(PetscSFCreate(comm, &sf)); 1091 PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1092 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1093 PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE)); 1094 PetscCall(PetscSFSetFromOptions(sf)); 1095 PetscCall(PetscSFSetUp(sf)); 1096 /* How many 
neighbors have contributions to my rows? */ 1097 PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees)); 1098 PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees)); 1099 rootspacesize = 0; 1100 for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i]; 1101 PetscCall(PetscMalloc1(rootspacesize, &rootspace)); 1102 PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets)); 1103 /* Get information from leaves 1104 * Number of columns other people contribute to my rows 1105 * */ 1106 PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace)); 1107 PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace)); 1108 PetscCall(PetscFree(c_rmtc)); 1109 PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi)); 1110 /* The number of columns is received for each row */ 1111 ptap->c_othi[0] = 0; 1112 rootspacesize = 0; 1113 rootspaceoffsets[0] = 0; 1114 for (i = 0; i < pn; i++) { 1115 rcvncols = 0; 1116 for (j = 0; j < rootdegrees[i]; j++) { 1117 rcvncols += rootspace[rootspacesize]; 1118 rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1119 rootspacesize++; 1120 } 1121 ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols; 1122 } 1123 PetscCall(PetscFree(rootspace)); 1124 1125 PetscCall(PetscMalloc1(pon, &c_rmtoffsets)); 1126 PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1127 PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1128 PetscCall(PetscSFDestroy(&sf)); 1129 PetscCall(PetscFree(rootspaceoffsets)); 1130 1131 PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote)); 1132 nleaves = 0; 1133 for (i = 0; i < pon; i++) { 1134 owner = 0; 1135 ii = i / dof; 1136 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL)); 1137 sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i]; 1138 for (j = 0; j < sendncols; j++) { 1139 iremote[nleaves].rank = owner; 1140 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1141 } 1142 } 1143 PetscCall(PetscFree(c_rmtoffsets)); 1144 
PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj)); 1145 1146 PetscCall(PetscSFCreate(comm, &ptap->sf)); 1147 PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1148 PetscCall(PetscSFSetFromOptions(ptap->sf)); 1149 /* One to one map */ 1150 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1151 1152 PetscCall(PetscMalloc2(pn, &dnz, pn, &onz)); 1153 PetscCall(PetscHSetICreate(&oht)); 1154 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 1155 pcstart *= dof; 1156 pcend *= dof; 1157 PetscCall(PetscMalloc2(pn, &hta, pn, &hto)); 1158 for (i = 0; i < pn; i++) { 1159 PetscCall(PetscHSetICreate(&hta[i])); 1160 PetscCall(PetscHSetICreate(&hto[i])); 1161 } 1162 /* Work on local part */ 1163 /* 4) Pass 1: Estimate memory for C_loc */ 1164 for (i = 0; i < am && pn; i++) { 1165 PetscCall(PetscHSetIClear(ht)); 1166 PetscCall(PetscHSetIClear(oht)); 1167 offset = i % dof; 1168 ii = i / dof; 1169 nzi = pd->i[ii + 1] - pd->i[ii]; 1170 if (!nzi) continue; 1171 1172 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht)); 1173 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1174 PetscCall(PetscHSetIGetSize(oht, &htosize)); 1175 if (!(htsize + htosize)) continue; 1176 /* Form C(ii, :) */ 1177 pdj = pd->j + pd->i[ii]; 1178 for (j = 0; j < nzi; j++) { 1179 PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht)); 1180 PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht)); 1181 } 1182 } 1183 1184 PetscCall(ISRestoreIndices(map, &mappingindices)); 1185 1186 PetscCall(PetscHSetIDestroy(&ht)); 1187 PetscCall(PetscHSetIDestroy(&oht)); 1188 1189 /* Get remote data */ 1190 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1191 PetscCall(PetscFree(c_rmtj)); 1192 1193 for (i = 0; i < pn; i++) { 1194 nzi = ptap->c_othi[i + 1] - ptap->c_othi[i]; 1195 rdj = c_othj + ptap->c_othi[i]; 1196 for (j = 0; j < nzi; j++) 
{ 1197 col = rdj[j]; 1198 /* diag part */ 1199 if (col >= pcstart && col < pcend) { 1200 PetscCall(PetscHSetIAdd(hta[i], col)); 1201 } else { /* off diag */ 1202 PetscCall(PetscHSetIAdd(hto[i], col)); 1203 } 1204 } 1205 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1206 dnz[i] = htsize; 1207 PetscCall(PetscHSetIDestroy(&hta[i])); 1208 PetscCall(PetscHSetIGetSize(hto[i], &htsize)); 1209 onz[i] = htsize; 1210 PetscCall(PetscHSetIDestroy(&hto[i])); 1211 } 1212 1213 PetscCall(PetscFree2(hta, hto)); 1214 PetscCall(PetscFree(c_othj)); 1215 1216 /* local sizes and preallocation */ 1217 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1218 PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs)); 1219 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 1220 PetscCall(MatSetUp(Cmpi)); 1221 PetscCall(PetscFree2(dnz, onz)); 1222 1223 /* attach the supporting struct to Cmpi for reuse */ 1224 Cmpi->product->data = ptap; 1225 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 1226 Cmpi->product->view = MatView_MPIAIJ_PtAP; 1227 1228 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1229 Cmpi->assembled = PETSC_FALSE; 1230 /* pick an algorithm */ 1231 PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat"); 1232 alg = 0; 1233 PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg)); 1234 PetscOptionsEnd(); 1235 switch (alg) { 1236 case 0: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; break; 1237 case 1: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; break; 1238 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm "); 1239 } 1240 PetscFunctionReturn(0); 1241 } 1242 1243 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C) { 1244 
PetscFunctionBegin; 1245 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C)); 1246 PetscFunctionReturn(0); 1247 } 1248 1249 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) { 1250 Mat_APMPI *ptap; 1251 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data; 1252 MPI_Comm comm; 1253 Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 1254 MatType mtype; 1255 PetscSF sf; 1256 PetscSFNode *iremote; 1257 PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves; 1258 const PetscInt *rootdegrees; 1259 PetscHSetI ht, oht, *hta, *hto, *htd; 1260 PetscInt pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets; 1261 PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj; 1262 PetscInt nalg = 2, alg = 0, offset, ii; 1263 PetscMPIInt owner; 1264 PetscBool flg; 1265 const char *algTypes[2] = {"merged", "overlapping"}; 1266 const PetscInt *mappingindices; 1267 IS map; 1268 1269 PetscFunctionBegin; 1270 MatCheckProduct(Cmpi, 5); 1271 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1272 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1273 1274 /* Create symbolic parallel matrix Cmpi */ 1275 PetscCall(MatGetLocalSize(P, NULL, &pn)); 1276 pn *= dof; 1277 PetscCall(MatGetType(A, &mtype)); 1278 PetscCall(MatSetType(Cmpi, mtype)); 1279 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1280 1281 PetscCall(PetscNew(&ptap)); 1282 ptap->reuse = MAT_INITIAL_MATRIX; 1283 ptap->algType = 3; 1284 1285 /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1286 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth)); 1287 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 1288 1289 /* This equals to the number of 
offdiag columns in P */ 1290 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 1291 pon *= dof; 1292 /* offsets */ 1293 PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti)); 1294 /* The number of columns we will send to remote ranks */ 1295 PetscCall(PetscMalloc1(pon, &c_rmtc)); 1296 PetscCall(PetscMalloc1(pon, &hta)); 1297 for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i])); 1298 PetscCall(MatGetLocalSize(A, &am, NULL)); 1299 PetscCall(MatGetOwnershipRange(A, &arstart, &arend)); 1300 /* Create hash table to merge all columns for C(i, :) */ 1301 PetscCall(PetscHSetICreate(&ht)); 1302 PetscCall(PetscHSetICreate(&oht)); 1303 PetscCall(PetscMalloc2(pn, &htd, pn, &hto)); 1304 for (i = 0; i < pn; i++) { 1305 PetscCall(PetscHSetICreate(&htd[i])); 1306 PetscCall(PetscHSetICreate(&hto[i])); 1307 } 1308 1309 PetscCall(ISGetIndices(map, &mappingindices)); 1310 ptap->c_rmti[0] = 0; 1311 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1312 for (i = 0; i < am && (pon || pn); i++) { 1313 /* Form one row of AP */ 1314 PetscCall(PetscHSetIClear(ht)); 1315 PetscCall(PetscHSetIClear(oht)); 1316 offset = i % dof; 1317 ii = i / dof; 1318 /* If the off diag is empty, we should not do any calculation */ 1319 nzi = po->i[ii + 1] - po->i[ii]; 1320 dnzi = pd->i[ii + 1] - pd->i[ii]; 1321 if (!nzi && !dnzi) continue; 1322 1323 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht)); 1324 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1325 PetscCall(PetscHSetIGetSize(oht, &htosize)); 1326 /* If AP is empty, just continue */ 1327 if (!(htsize + htosize)) continue; 1328 1329 /* Form remote C(ii, :) */ 1330 poj = po->j + po->i[ii]; 1331 for (j = 0; j < nzi; j++) { 1332 PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht)); 1333 PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht)); 1334 } 1335 1336 /* Form local C(ii, :) */ 1337 pdj = pd->j + pd->i[ii]; 1338 for (j = 0; j < dnzi; j++) { 1339 
PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht)); 1340 PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht)); 1341 } 1342 } 1343 1344 PetscCall(ISRestoreIndices(map, &mappingindices)); 1345 1346 PetscCall(PetscHSetIDestroy(&ht)); 1347 PetscCall(PetscHSetIDestroy(&oht)); 1348 1349 for (i = 0; i < pon; i++) { 1350 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1351 ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize; 1352 c_rmtc[i] = htsize; 1353 } 1354 1355 PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj)); 1356 1357 for (i = 0; i < pon; i++) { 1358 off = 0; 1359 PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i])); 1360 PetscCall(PetscHSetIDestroy(&hta[i])); 1361 } 1362 PetscCall(PetscFree(hta)); 1363 1364 PetscCall(PetscMalloc1(pon, &iremote)); 1365 for (i = 0; i < pon; i++) { 1366 owner = 0; 1367 lidx = 0; 1368 offset = i % dof; 1369 ii = i / dof; 1370 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx)); 1371 iremote[i].index = lidx * dof + offset; 1372 iremote[i].rank = owner; 1373 } 1374 1375 PetscCall(PetscSFCreate(comm, &sf)); 1376 PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1377 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1378 PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE)); 1379 PetscCall(PetscSFSetFromOptions(sf)); 1380 PetscCall(PetscSFSetUp(sf)); 1381 /* How many neighbors have contributions to my rows? 
*/ 1382 PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees)); 1383 PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees)); 1384 rootspacesize = 0; 1385 for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i]; 1386 PetscCall(PetscMalloc1(rootspacesize, &rootspace)); 1387 PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets)); 1388 /* Get information from leaves 1389 * Number of columns other people contribute to my rows 1390 * */ 1391 PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace)); 1392 PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace)); 1393 PetscCall(PetscFree(c_rmtc)); 1394 PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi)); 1395 /* The number of columns is received for each row */ 1396 ptap->c_othi[0] = 0; 1397 rootspacesize = 0; 1398 rootspaceoffsets[0] = 0; 1399 for (i = 0; i < pn; i++) { 1400 rcvncols = 0; 1401 for (j = 0; j < rootdegrees[i]; j++) { 1402 rcvncols += rootspace[rootspacesize]; 1403 rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1404 rootspacesize++; 1405 } 1406 ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols; 1407 } 1408 PetscCall(PetscFree(rootspace)); 1409 1410 PetscCall(PetscMalloc1(pon, &c_rmtoffsets)); 1411 PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1412 PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1413 PetscCall(PetscSFDestroy(&sf)); 1414 PetscCall(PetscFree(rootspaceoffsets)); 1415 1416 PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote)); 1417 nleaves = 0; 1418 for (i = 0; i < pon; i++) { 1419 owner = 0; 1420 ii = i / dof; 1421 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL)); 1422 sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i]; 1423 for (j = 0; j < sendncols; j++) { 1424 iremote[nleaves].rank = owner; 1425 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1426 } 1427 } 1428 PetscCall(PetscFree(c_rmtoffsets)); 1429 PetscCall(PetscCalloc1(ptap->c_othi[pn], 
&c_othj)); 1430 1431 PetscCall(PetscSFCreate(comm, &ptap->sf)); 1432 PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1433 PetscCall(PetscSFSetFromOptions(ptap->sf)); 1434 /* One to one map */ 1435 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1436 /* Get remote data */ 1437 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1438 PetscCall(PetscFree(c_rmtj)); 1439 PetscCall(PetscMalloc2(pn, &dnz, pn, &onz)); 1440 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 1441 pcstart *= dof; 1442 pcend *= dof; 1443 for (i = 0; i < pn; i++) { 1444 nzi = ptap->c_othi[i + 1] - ptap->c_othi[i]; 1445 rdj = c_othj + ptap->c_othi[i]; 1446 for (j = 0; j < nzi; j++) { 1447 col = rdj[j]; 1448 /* diag part */ 1449 if (col >= pcstart && col < pcend) { 1450 PetscCall(PetscHSetIAdd(htd[i], col)); 1451 } else { /* off diag */ 1452 PetscCall(PetscHSetIAdd(hto[i], col)); 1453 } 1454 } 1455 PetscCall(PetscHSetIGetSize(htd[i], &htsize)); 1456 dnz[i] = htsize; 1457 PetscCall(PetscHSetIDestroy(&htd[i])); 1458 PetscCall(PetscHSetIGetSize(hto[i], &htsize)); 1459 onz[i] = htsize; 1460 PetscCall(PetscHSetIDestroy(&hto[i])); 1461 } 1462 1463 PetscCall(PetscFree2(htd, hto)); 1464 PetscCall(PetscFree(c_othj)); 1465 1466 /* local sizes and preallocation */ 1467 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1468 PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? 
dof : P->cmap->bs)); 1469 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 1470 PetscCall(PetscFree2(dnz, onz)); 1471 1472 /* attach the supporting struct to Cmpi for reuse */ 1473 Cmpi->product->data = ptap; 1474 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 1475 Cmpi->product->view = MatView_MPIAIJ_PtAP; 1476 1477 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1478 Cmpi->assembled = PETSC_FALSE; 1479 /* pick an algorithm */ 1480 PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat"); 1481 alg = 0; 1482 PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg)); 1483 PetscOptionsEnd(); 1484 switch (alg) { 1485 case 0: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; break; 1486 case 1: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; break; 1487 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm "); 1488 } 1489 PetscFunctionReturn(0); 1490 } 1491 1492 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C) { 1493 PetscFunctionBegin; 1494 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C)); 1495 PetscFunctionReturn(0); 1496 } 1497 1498 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi) { 1499 Mat_APMPI *ptap; 1500 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 1501 MPI_Comm comm; 1502 PetscMPIInt size, rank; 1503 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1504 PetscInt am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n; 1505 PetscInt *lnk, i, k, pnz, row, nsend; 1506 PetscBT lnkbt; 1507 PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv; 1508 PETSC_UNUSED PetscMPIInt icompleted = 0; 1509 PetscInt **buf_rj, **buf_ri, **buf_ri_k; 1510 PetscInt len, proc, *dnz, 
*onz, *owners, nzi, nspacedouble; 1511 PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci; 1512 MPI_Request *swaits, *rwaits; 1513 MPI_Status *sstatus, rstatus; 1514 PetscLayout rowmap; 1515 PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */ 1516 PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */ 1517 PetscInt *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi; 1518 Mat_SeqAIJ *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = NULL, *c_loc, *c_oth; 1519 PetscScalar *apv; 1520 PetscTable ta; 1521 MatType mtype; 1522 const char *prefix; 1523 #if defined(PETSC_USE_INFO) 1524 PetscReal apfill; 1525 #endif 1526 1527 PetscFunctionBegin; 1528 MatCheckProduct(Cmpi, 4); 1529 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1530 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1531 PetscCallMPI(MPI_Comm_size(comm, &size)); 1532 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 1533 1534 if (size > 1) ao = (Mat_SeqAIJ *)(a->B)->data; 1535 1536 /* create symbolic parallel matrix Cmpi */ 1537 PetscCall(MatGetType(A, &mtype)); 1538 PetscCall(MatSetType(Cmpi, mtype)); 1539 1540 /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */ 1541 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ; 1542 1543 /* create struct Mat_APMPI and attached it to C later */ 1544 PetscCall(PetscNew(&ptap)); 1545 ptap->reuse = MAT_INITIAL_MATRIX; 1546 ptap->algType = 1; 1547 1548 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1549 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 1550 /* get P_loc by taking all local rows of P */ 1551 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 1552 1553 /* (0) compute Rd = Pd^T, Ro = Po^T */ 1554 /* 
--------------------------------- */ 1555 PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd)); 1556 PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro)); 1557 1558 /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */ 1559 /* ----------------------------------------------------------------- */ 1560 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 1561 if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 1562 1563 /* create and initialize a linked list */ 1564 PetscCall(PetscTableCreate(pn, pN, &ta)); /* for compute AP_loc and Cmpi */ 1565 MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta); 1566 MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta); 1567 PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */ 1568 /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */ 1569 1570 PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt)); 1571 1572 /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */ 1573 if (ao) { 1574 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space)); 1575 } else { 1576 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space)); 1577 } 1578 current_space = free_space; 1579 nspacedouble = 0; 1580 1581 PetscCall(PetscMalloc1(am + 1, &api)); 1582 api[0] = 0; 1583 for (i = 0; i < am; i++) { 1584 /* diagonal portion: Ad[i,:]*P */ 1585 ai = ad->i; 1586 pi = p_loc->i; 1587 nzi = ai[i + 1] - ai[i]; 1588 aj = ad->j + ai[i]; 1589 for (j = 0; j < nzi; j++) { 1590 row = aj[j]; 1591 pnz = pi[row + 1] - pi[row]; 1592 Jptr = p_loc->j + pi[row]; 1593 /* add non-zero cols of P into the sorted linked list lnk */ 1594 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 1595 } 1596 /* off-diagonal portion: Ao[i,:]*P */ 1597 if (ao) { 1598 ai = ao->i; 1599 pi = p_oth->i; 1600 nzi = 
ai[i + 1] - ai[i]; 1601 aj = ao->j + ai[i]; 1602 for (j = 0; j < nzi; j++) { 1603 row = aj[j]; 1604 pnz = pi[row + 1] - pi[row]; 1605 Jptr = p_oth->j + pi[row]; 1606 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 1607 } 1608 } 1609 apnz = lnk[0]; 1610 api[i + 1] = api[i] + apnz; 1611 if (ap_rmax < apnz) ap_rmax = apnz; 1612 1613 /* if free space is not available, double the total space in the list */ 1614 if (current_space->local_remaining < apnz) { 1615 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 1616 nspacedouble++; 1617 } 1618 1619 /* Copy data into free space, then initialize lnk */ 1620 PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt)); 1621 1622 current_space->array += apnz; 1623 current_space->local_used += apnz; 1624 current_space->local_remaining -= apnz; 1625 } 1626 /* Allocate space for apj and apv, initialize apj, and */ 1627 /* destroy list of free space and other temporary array(s) */ 1628 PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv)); 1629 PetscCall(PetscFreeSpaceContiguous(&free_space, apj)); 1630 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1631 1632 /* Create AP_loc for reuse */ 1633 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc)); 1634 PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name)); 1635 #if defined(PETSC_USE_INFO) 1636 if (ao) { 1637 apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1); 1638 } else { 1639 apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1); 1640 } 1641 ptap->AP_loc->info.mallocs = nspacedouble; 1642 ptap->AP_loc->info.fill_ratio_given = fill; 1643 ptap->AP_loc->info.fill_ratio_needed = apfill; 1644 1645 if (api[am]) { 1646 PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill)); 1647 PetscCall(PetscInfo(ptap->AP_loc, "Use 
MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill)); 1648 } else { 1649 PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty \n")); 1650 } 1651 #endif 1652 1653 /* (2-1) compute symbolic Co = Ro*AP_loc */ 1654 /* ------------------------------------ */ 1655 PetscCall(MatGetOptionsPrefix(A, &prefix)); 1656 PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix)); 1657 PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_")); 1658 PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth)); 1659 PetscCall(MatGetOptionsPrefix(Cmpi, &prefix)); 1660 PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix)); 1661 PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_")); 1662 PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB)); 1663 PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default")); 1664 PetscCall(MatProductSetFill(ptap->C_oth, fill)); 1665 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 1666 PetscCall(MatProductSymbolic(ptap->C_oth)); 1667 1668 /* (3) send coj of C_oth to other processors */ 1669 /* ------------------------------------------ */ 1670 /* determine row ownership */ 1671 PetscCall(PetscLayoutCreate(comm, &rowmap)); 1672 rowmap->n = pn; 1673 rowmap->bs = 1; 1674 PetscCall(PetscLayoutSetUp(rowmap)); 1675 owners = rowmap->range; 1676 1677 /* determine the number of messages to send, their lengths */ 1678 PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co)); 1679 PetscCall(PetscArrayzero(len_s, size)); 1680 PetscCall(PetscArrayzero(len_si, size)); 1681 1682 c_oth = (Mat_SeqAIJ *)ptap->C_oth->data; 1683 coi = c_oth->i; 1684 coj = c_oth->j; 1685 con = ptap->C_oth->rmap->n; 1686 proc = 0; 1687 for (i = 0; i < con; i++) { 1688 while (prmap[i] >= owners[proc + 1]) proc++; 1689 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 1690 len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 1691 } 1692 1693 len = 0; 
/* max length of buf_si[], see (4) */ 1694 owners_co[0] = 0; 1695 nsend = 0; 1696 for (proc = 0; proc < size; proc++) { 1697 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 1698 if (len_s[proc]) { 1699 nsend++; 1700 len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 1701 len += len_si[proc]; 1702 } 1703 } 1704 1705 /* determine the number and length of messages to receive for coi and coj */ 1706 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv)); 1707 PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri)); 1708 1709 /* post the Irecv and Isend of coj */ 1710 PetscCall(PetscCommGetNewTag(comm, &tagj)); 1711 PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits)); 1712 PetscCall(PetscMalloc1(nsend + 1, &swaits)); 1713 for (proc = 0, k = 0; proc < size; proc++) { 1714 if (!len_s[proc]) continue; 1715 i = owners_co[proc]; 1716 PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 1717 k++; 1718 } 1719 1720 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 1721 /* ---------------------------------------- */ 1722 PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix)); 1723 PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_")); 1724 PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc)); 1725 PetscCall(MatGetOptionsPrefix(Cmpi, &prefix)); 1726 PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix)); 1727 PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_")); 1728 PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB)); 1729 PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default")); 1730 PetscCall(MatProductSetFill(ptap->C_loc, fill)); 1731 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 1732 PetscCall(MatProductSymbolic(ptap->C_loc)); 1733 1734 c_loc = (Mat_SeqAIJ *)ptap->C_loc->data; 1735 1736 /* receives coj are complete */ 1737 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, 
&icompleted, &rstatus)); 1738 PetscCall(PetscFree(rwaits)); 1739 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 1740 1741 /* add received column indices into ta to update Crmax */ 1742 for (k = 0; k < nrecv; k++) { /* k-th received message */ 1743 Jptr = buf_rj[k]; 1744 for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES)); 1745 } 1746 PetscCall(PetscTableGetCount(ta, &Crmax)); 1747 PetscCall(PetscTableDestroy(&ta)); 1748 1749 /* (4) send and recv coi */ 1750 /*-----------------------*/ 1751 PetscCall(PetscCommGetNewTag(comm, &tagi)); 1752 PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits)); 1753 PetscCall(PetscMalloc1(len + 1, &buf_s)); 1754 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1755 for (proc = 0, k = 0; proc < size; proc++) { 1756 if (!len_s[proc]) continue; 1757 /* form outgoing message for i-structure: 1758 buf_si[0]: nrows to be sent 1759 [1:nrows]: row index (global) 1760 [nrows+1:2*nrows+1]: i-structure index 1761 */ 1762 /*-------------------------------------------*/ 1763 nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */ 1764 buf_si_i = buf_si + nrows + 1; 1765 buf_si[0] = nrows; 1766 buf_si_i[0] = 0; 1767 nrows = 0; 1768 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 1769 nzi = coi[i + 1] - coi[i]; 1770 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 1771 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 1772 nrows++; 1773 } 1774 PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 1775 k++; 1776 buf_si += len_si[proc]; 1777 } 1778 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 1779 PetscCall(PetscFree(rwaits)); 1780 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 1781 1782 PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co)); 1783 PetscCall(PetscFree(len_ri)); 1784 
PetscCall(PetscFree(swaits)); 1785 PetscCall(PetscFree(buf_s)); 1786 1787 /* (5) compute the local portion of Cmpi */ 1788 /* ------------------------------------------ */ 1789 /* set initial free space to be Crmax, sufficient for holding nozeros in each row of Cmpi */ 1790 PetscCall(PetscFreeSpaceGet(Crmax, &free_space)); 1791 current_space = free_space; 1792 1793 PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci)); 1794 for (k = 0; k < nrecv; k++) { 1795 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1796 nrows = *buf_ri_k[k]; 1797 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1798 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 1799 } 1800 1801 MatPreallocateBegin(comm, pn, pn, dnz, onz); 1802 PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt)); 1803 for (i = 0; i < pn; i++) { 1804 /* add C_loc into Cmpi */ 1805 nzi = c_loc->i[i + 1] - c_loc->i[i]; 1806 Jptr = c_loc->j + c_loc->i[i]; 1807 PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt)); 1808 1809 /* add received col data into lnk */ 1810 for (k = 0; k < nrecv; k++) { /* k-th received message */ 1811 if (i == *nextrow[k]) { /* i-th row */ 1812 nzi = *(nextci[k] + 1) - *nextci[k]; 1813 Jptr = buf_rj[k] + *nextci[k]; 1814 PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt)); 1815 nextrow[k]++; 1816 nextci[k]++; 1817 } 1818 } 1819 nzi = lnk[0]; 1820 1821 /* copy data into free space, then initialize lnk */ 1822 PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt)); 1823 PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz)); 1824 } 1825 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 1826 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1827 PetscCall(PetscFreeSpaceDestroy(free_space)); 1828 1829 /* local sizes and preallocation */ 1830 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1831 if 
(P->cmap->bs > 0) {
    /* C = P^T*A*P inherits the block size of P's columns on both its row and column maps */
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* dense work array of length pN, reused row-by-row by AProw_nonscalable() in the numeric phase */
  PetscCall(PetscCalloc1(pN, &ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - numeric phase of C = P^T*A*P (nonscalable algorithm).

   Consumes the symbolic data stashed in C->product->data by the symbolic phase:
   Rd/Ro (transposes of P's diagonal/off-diagonal parts), AP_loc (sparsity of A_loc*P),
   C_loc/C_oth (symbolic inner products) and apa (dense row work array).
   On reuse (ptap->reuse == MAT_REUSE_MATRIX) the transposes and P_loc/P_oth are
   refreshed in place instead of being rebuilt.
*/
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data;
  Mat_SeqAIJ *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI *ptap;
  Mat AP_loc, C_loc, C_oth;
  PetscInt i, rstart, rend, cm, ncols, row;
  PetscInt *api, *apj, am = A->rmap->n, j, col, apnz;
  PetscScalar *apa;
  const PetscInt *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* 1) get R = Pd^T, Ro = Po^T; only recompute values on reuse (structure was built in symbolic) */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* ---------------------------------------------- */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data;
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth; accumulate the dense row in apa */
    AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
    apnz = api[i + 1] - api[i];
    /* scatter the dense row back into AP_loc's CSR storage, zeroing apa for the next row */
    for (j = 0; j < apnz; j++) {
      col = apj[j + api[i]];
      ap->a[j + ap->i[i]] = apa[col];
      apa[col] = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and Co to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts. */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled = PETSC_FALSE;
  }
  if (C->was_assembled) {
    /* slow path: per-row insertion with global column lookup */
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    /* fast path: bulk copy of the whole local CSR block */
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }

  /* Co -> C, off-processor part: rows of C_oth map to the global rows in p->garray */
  cm = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  /* subsequent calls reuse all symbolic structures */
  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(0);
}

/*
   MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ - dispatch the symbolic phase of C = P^T*A*P
   to the implementation selected by the product's algorithm string.

   The scalable/nonscalable/allatonce(_merged) branches fall through to `next`, which
   installs the shared MatProductNumeric_PtAP numeric driver; the backend and hypre
   branches return directly (backend installs its own numeric routine).
*/
PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C) {
  Mat_Product *product = C->product;
  Mat A = product->A, P = product->B;
  MatProductAlgorithm alg = product->alg;
  PetscReal fill = product->fill;
  PetscBool flg;

  PetscFunctionBegin;
  /* scalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "scalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    goto next;
  }

  /* nonscalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
    goto next;
  }

  /* allatonce */
  PetscCall(PetscStrcmp(alg, "allatonce", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
    goto next;
  }

  /* allatonce_merged */
  PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
    goto next;
  }

  /* backend general code */
  PetscCall(PetscStrcmp(alg, "backend", &flg));
  if (flg) {
    PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
    PetscFunctionReturn(0);
  }

  /* hypre */
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscStrcmp(alg, "hypre", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(0);
  }
#endif
  SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

next:
  C->ops->productnumeric = MatProductNumeric_PtAP;
  PetscFunctionReturn(0);
}