/*
  Defines projective product routines where A is an MPIAIJ matrix
          C = P^T * A * P
*/

#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petsctime.h>
#include <petsc/private/hashmapiv.h>
#include <petsc/private/hashseti.h>
#include <petscsf.h>

static PetscErrorCode MatView_MPIAIJ_PtAP(Mat A, PetscViewer viewer)
{
  PetscBool         isascii;
  PetscViewerFormat format;
  Mat_APMPI        *ptap;

  PetscFunctionBegin;
  MatCheckProduct(A, 1);
  ptap = (Mat_APMPI *)A->product->data;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
  if (isascii) {
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      if (ptap->algType == 0) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using scalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 1) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using nonscalable MatPtAP() implementation\n"));
      } else if (ptap->algType == 2) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using allatonce MatPtAP() implementation\n"));
      } else if (ptap->algType == 3) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using merged allatonce MatPtAP() implementation\n"));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
{
  Mat_APMPI           *ptap = (Mat_APMPI *)data;
  Mat_Merge_SeqsToMPI *merge;

  PetscFunctionBegin;
  PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r));
  PetscCall(PetscFree(ptap->bufa));
  PetscCall(MatDestroy(&ptap->P_loc));
  PetscCall(MatDestroy(&ptap->P_oth));
  PetscCall(MatDestroy(&ptap->A_loc)); /* used by MatTransposeMatMult() */
  PetscCall(MatDestroy(&ptap->Rd));
  PetscCall(MatDestroy(&ptap->Ro));
  if (ptap->AP_loc) { /* used by alg_rap */
    Mat_SeqAIJ *ap = (Mat_SeqAIJ *)ptap->AP_loc->data;
    PetscCall(PetscFree(ap->i));
    PetscCall(PetscFree2(ap->j, ap->a));
    PetscCall(MatDestroy(&ptap->AP_loc));
  } else { /* used by alg_ptap */
    PetscCall(PetscFree(ptap->api));
    PetscCall(PetscFree(ptap->apj));
  }
  PetscCall(MatDestroy(&ptap->C_loc));
  PetscCall(MatDestroy(&ptap->C_oth));
  if (ptap->apa) PetscCall(PetscFree(ptap->apa));

  PetscCall(MatDestroy(&ptap->Pt));

  merge = ptap->merge;
  if (merge) { /* used by alg_ptap */
    PetscCall(PetscFree(merge->id_r));
    PetscCall(PetscFree(merge->len_s));
    PetscCall(PetscFree(merge->len_r));
    PetscCall(PetscFree(merge->bi));
    PetscCall(PetscFree(merge->bj));
    PetscCall(PetscFree(merge->buf_ri[0]));
    PetscCall(PetscFree(merge->buf_ri));
    PetscCall(PetscFree(merge->buf_rj[0]));
    PetscCall(PetscFree(merge->buf_rj));
    PetscCall(PetscFree(merge->coi));
    PetscCall(PetscFree(merge->coj));
    PetscCall(PetscFree(merge->owners_co));
    PetscCall(PetscLayoutDestroy(&merge->rowmap));
    PetscCall(PetscFree(ptap->merge));
  }
  PetscCall(ISLocalToGlobalMappingDestroy(&ptap->ltog));

  PetscCall(PetscSFDestroy(&ptap->sf));
  PetscCall(PetscFree(ptap->c_othi));
  PetscCall(PetscFree(ptap->c_rmti));
  PetscCall(PetscFree(ptap));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row, *api, *apj, am = A->rmap->n, apnz, nout;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));

  /* 1) get Rd = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;

  api = ap->i;
  apj = ap->j;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, api[AP_loc->rmap->n], apj, apj));
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    apnz = api[i + 1] - api[i];
    apa  = ap->a + api[i];
    PetscCall(PetscArrayzero(apa, apnz));
    AProw_scalable(i, ad, ao, p_loc, p_oth, api, apj, apa);
  }
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, api[AP_loc->rmap->n], apj, &nout, apj));
  PetscCheck(api[AP_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, api[AP_loc->rmap->n], nout);

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  /* Always use scalable version since we are in the MPI scalable version */
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd, AP_loc, ptap->C_loc));
  PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro, AP_loc, ptap->C_oth));

  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_loc->rmap->n], c_seq->j, c_seq->j));

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts. */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
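  /*
     Illustration (not part of the computation): the fast path below copies the local CSR arrays of C_loc
     straight into C, which is only valid the first time C is assembled and when every column already fits
     the layout expected by MatSetValues_MPIAIJ_CopyFromCSRFormat(). A hedged sketch of the general
     fallback, with nlocalrows, rowidx, ncol, colidx and val as placeholder data, is the standard
     insertion loop:

       for (PetscInt r = 0; r < nlocalrows; r++) {
         PetscCall(MatSetValues(C, 1, &rowidx[r], ncol[r], colidx[r], val[r], ADD_VALUES));
       }
       PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
       PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

     which is what the was_assembled branch below effectively does row by row.
  */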
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols += ncols;
      vals += ncols;
    }
  } else {
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }
  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_loc->rmap->n], c_seq->j, &nout, c_seq->j));
  PetscCheck(c_seq->i[C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_loc->rmap->n], nout);

  /* C_oth -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_seq->i[C_oth->rmap->n], c_seq->j, c_seq->j));
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;

  PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_seq->i[C_oth->rmap->n], c_seq->j, &nout, c_seq->j));
  PetscCheck(c_seq->i[C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_seq->i[C_oth->rmap->n], nout);
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  Mat                      P_loc, P_oth;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv, nsend, proc;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  const PetscInt          *owners;
  PetscInt                 len, *dnz, *onz, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, Crmax, *aj, *ai, *pi, nout;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;

  /* create symbolic parallel matrix Cmpi */
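  /*
     Context (hedged usage sketch, not code executed here): this symbolic routine is normally reached
     through the MatProduct interface rather than called directly. A typical user-level sequence that
     ends up in the scalable PtAP kernels for MPIAIJ matrices looks like

       Mat C;
       PetscCall(MatProductCreate(A, P, NULL, &C));       // A and P are the MPIAIJ operands
       PetscCall(MatProductSetType(C, MATPRODUCT_PtAP));
       PetscCall(MatProductSetAlgorithm(C, "scalable"));  // algorithm string assumed; can also come from the options database
       PetscCall(MatProductSetFill(C, fill));             // fill is a placeholder estimate
       PetscCall(MatProductSetFromOptions(C));
       PetscCall(MatProductSymbolic(C));                  // dispatches to this routine
       PetscCall(MatProductNumeric(C));                   // dispatches to MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable()

     See MatProductSetFromOptions() for the exact runtime options.
  */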
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 0;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &P_loc));

  ptap->P_loc = P_loc;
  ptap->P_oth = P_oth;

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  p_loc = (Mat_SeqAIJ *)P_loc->data;
  if (P_oth) p_oth = (Mat_SeqAIJ *)P_oth->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */

  PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = PetscSafePointerPlusOffset(ad->j, ai[i]);
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = PetscSafePointerPlusOffset(ao->j, ai[i]);
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk));

    current_space->array += apnz;
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscCalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));

  /* Create AP_loc
for reuse */ 348 PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc)); 349 PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog)); 350 351 #if defined(PETSC_USE_INFO) 352 if (ao) { 353 apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1); 354 } else { 355 apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1); 356 } 357 ptap->AP_loc->info.mallocs = nspacedouble; 358 ptap->AP_loc->info.fill_ratio_given = fill; 359 ptap->AP_loc->info.fill_ratio_needed = apfill; 360 361 if (api[am]) { 362 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill)); 363 PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n", (double)apfill)); 364 } else { 365 PetscCall(PetscInfo(ptap->AP_loc, "Scalable algorithm, AP_loc is empty \n")); 366 } 367 #endif 368 369 /* (2-1) compute symbolic Co = Ro*AP_loc */ 370 PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth)); 371 PetscCall(MatGetOptionsPrefix(A, &prefix)); 372 PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix)); 373 PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_offdiag_")); 374 375 PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB)); 376 PetscCall(MatProductSetAlgorithm(ptap->C_oth, "sorted")); 377 PetscCall(MatProductSetFill(ptap->C_oth, fill)); 378 PetscCall(MatProductSetFromOptions(ptap->C_oth)); 379 PetscCall(MatProductSymbolic(ptap->C_oth)); 380 381 /* (3) send coj of C_oth to other processors */ 382 /* determine row ownership */ 383 PetscCall(PetscLayoutCreate(comm, &rowmap)); 384 PetscCall(PetscLayoutSetLocalSize(rowmap, pn)); 385 PetscCall(PetscLayoutSetBlockSize(rowmap, 1)); 386 PetscCall(PetscLayoutSetUp(rowmap)); 387 PetscCall(PetscLayoutGetRanges(rowmap, &owners)); 388 389 /* determine the number of messages to send, their lengths */ 390 PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co)); 391 PetscCall(PetscArrayzero(len_s, size)); 392 PetscCall(PetscArrayzero(len_si, size)); 393 394 c_oth = (Mat_SeqAIJ *)ptap->C_oth->data; 395 coi = c_oth->i; 396 coj = c_oth->j; 397 con = ptap->C_oth->rmap->n; 398 proc = 0; 399 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, coi[con], coj, coj)); 400 for (i = 0; i < con; i++) { 401 while (prmap[i] >= owners[proc + 1]) proc++; 402 len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */ 403 len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */ 404 } 405 406 len = 0; /* max length of buf_si[], see (4) */ 407 owners_co[0] = 0; 408 nsend = 0; 409 for (proc = 0; proc < size; proc++) { 410 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 411 if (len_s[proc]) { 412 nsend++; 413 len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */ 414 len += len_si[proc]; 415 } 416 } 417 418 /* determine the number and length of messages to receive for coi and coj */ 419 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv)); 420 PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri)); 421 422 /* post the Irecv and Isend of coj */ 423 PetscCall(PetscCommGetNewTag(comm, &tagj)); 424 PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits)); 425 PetscCall(PetscMalloc1(nsend + 1, &swaits)); 426 for (proc = 0, k = 0; proc < size; proc++) { 427 if (!len_s[proc]) continue; 428 i = 
owners_co[proc]; 429 PetscCallMPI(MPIU_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 430 k++; 431 } 432 433 /* (2-2) compute symbolic C_loc = Rd*AP_loc */ 434 PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc)); 435 PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB)); 436 PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default")); 437 PetscCall(MatProductSetFill(ptap->C_loc, fill)); 438 439 PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix)); 440 PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_diag_")); 441 442 PetscCall(MatProductSetFromOptions(ptap->C_loc)); 443 PetscCall(MatProductSymbolic(ptap->C_loc)); 444 445 c_loc = (Mat_SeqAIJ *)ptap->C_loc->data; 446 PetscCall(ISLocalToGlobalMappingApply(ptap->ltog, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, c_loc->j)); 447 448 /* receives coj are complete */ 449 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 450 PetscCall(PetscFree(rwaits)); 451 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 452 453 /* add received column indices into ta to update Crmax */ 454 for (k = 0; k < nrecv; k++) { /* k-th received message */ 455 Jptr = buf_rj[k]; 456 for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1)); 457 } 458 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 459 PetscCall(PetscHMapIDestroy(&ta)); 460 461 /* (4) send and recv coi */ 462 PetscCall(PetscCommGetNewTag(comm, &tagi)); 463 PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits)); 464 PetscCall(PetscMalloc1(len + 1, &buf_s)); 465 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 466 for (proc = 0, k = 0; proc < size; proc++) { 467 if (!len_s[proc]) continue; 468 /* form outgoing message for i-structure: 469 buf_si[0]: nrows to be sent 470 [1:nrows]: row index (global) 471 [nrows+1:2*nrows+1]: i-structure index 472 */ 473 nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */ 474 buf_si_i = buf_si + nrows + 1; 475 buf_si[0] = nrows; 476 buf_si_i[0] = 0; 477 nrows = 0; 478 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 479 nzi = coi[i + 1] - coi[i]; 480 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 481 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 482 nrows++; 483 } 484 PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 485 k++; 486 buf_si += len_si[proc]; 487 } 488 for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); 489 PetscCall(PetscFree(rwaits)); 490 if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus)); 491 492 PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co)); 493 PetscCall(PetscFree(len_ri)); 494 PetscCall(PetscFree(swaits)); 495 PetscCall(PetscFree(buf_s)); 496 497 /* (5) compute the local portion of Cmpi */ 498 /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */ 499 PetscCall(PetscFreeSpaceGet(Crmax, &free_space)); 500 current_space = free_space; 501 502 PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci)); 503 for (k = 0; k < nrecv; k++) { 504 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 505 nrows = *buf_ri_k[k]; 506 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 507 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */ 508 } 509 510 MatPreallocateBegin(comm, pn, pn, dnz, onz); 511 
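 /*
    Note (illustrative, no effect on the code): MatPreallocateSet() splits each row count into dnz[] and
    onz[] by comparing column indices against this rank's ownership range. For example, if this rank owns
    global columns [cstart, cend) = [100, 200) and row 5 of Cmpi will contain columns {3, 120, 150, 410},
    the loop below records dnz[5] = 2 (columns 120 and 150) and onz[5] = 2 (columns 3 and 410), which is
    exactly what MatMPIAIJSetPreallocation() consumes afterwards.
 */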
PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 512 for (i = 0; i < pn; i++) { 513 /* add C_loc into Cmpi */ 514 nzi = c_loc->i[i + 1] - c_loc->i[i]; 515 Jptr = c_loc->j + c_loc->i[i]; 516 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 517 518 /* add received col data into lnk */ 519 for (k = 0; k < nrecv; k++) { /* k-th received message */ 520 if (i == *nextrow[k]) { /* i-th row */ 521 nzi = *(nextci[k] + 1) - *nextci[k]; 522 Jptr = buf_rj[k] + *nextci[k]; 523 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 524 nextrow[k]++; 525 nextci[k]++; 526 } 527 } 528 nzi = lnk[0]; 529 530 /* copy data into free space, then initialize lnk */ 531 PetscCall(PetscLLCondensedClean_Scalable(nzi, current_space->array, lnk)); 532 PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz)); 533 } 534 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 535 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 536 PetscCall(PetscFreeSpaceDestroy(free_space)); 537 538 /* local sizes and preallocation */ 539 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 540 if (P->cmap->bs > 0) { 541 PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs)); 542 PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs)); 543 } 544 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 545 MatPreallocateEnd(dnz, onz); 546 547 /* members in merge */ 548 PetscCall(PetscFree(id_r)); 549 PetscCall(PetscFree(len_r)); 550 PetscCall(PetscFree(buf_ri[0])); 551 PetscCall(PetscFree(buf_ri)); 552 PetscCall(PetscFree(buf_rj[0])); 553 PetscCall(PetscFree(buf_rj)); 554 PetscCall(PetscLayoutDestroy(&rowmap)); 555 556 nout = 0; 557 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_oth->i[ptap->C_oth->rmap->n], c_oth->j, &nout, c_oth->j)); 558 PetscCheck(c_oth->i[ptap->C_oth->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_oth->i[ptap->C_oth->rmap->n], nout); 559 PetscCall(ISGlobalToLocalMappingApply(ptap->ltog, IS_GTOLM_DROP, c_loc->i[ptap->C_loc->rmap->n], c_loc->j, &nout, c_loc->j)); 560 PetscCheck(c_loc->i[ptap->C_loc->rmap->n] == nout, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Incorrect mapping %" PetscInt_FMT " != %" PetscInt_FMT, c_loc->i[ptap->C_loc->rmap->n], nout); 561 562 /* attach the supporting struct to Cmpi for reuse */ 563 Cmpi->product->data = ptap; 564 Cmpi->product->view = MatView_MPIAIJ_PtAP; 565 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 566 567 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 568 Cmpi->assembled = PETSC_FALSE; 569 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable; 570 PetscFunctionReturn(PETSC_SUCCESS); 571 } 572 573 static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHSetI dht, PetscHSetI oht) 574 { 575 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 576 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 577 PetscInt *ai, nzi, j, *aj, row, col, *pi, *pj, pnz, nzpi, *p_othcols, k; 578 PetscInt pcstart, pcend, column, offset; 579 580 PetscFunctionBegin; 581 pcstart = P->cmap->rstart; 582 pcstart *= dof; 583 pcend = P->cmap->rend; 584 pcend *= dof; 585 /* diagonal portion: Ad[i,:]*P */ 586 ai = ad->i; 587 nzi = ai[i + 1] - ai[i]; 588 aj = ad->j + ai[i]; 589 
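  /*
     Worked example of the dof indexing used below (illustrative only): with dof = 2, local row i = 7 of A
     corresponds to block row ii = i / dof = 3 of P with component offset = i % dof = 1. A structural
     nonzero in column c of P's diagonal block therefore contributes global column c * dof + offset + pcstart
     to row i of A*P, which is the value added to the diagonal hash set dht in the loop that follows.
  */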
for (j = 0; j < nzi; j++) { 590 row = aj[j]; 591 offset = row % dof; 592 row /= dof; 593 nzpi = pd->i[row + 1] - pd->i[row]; 594 pj = pd->j + pd->i[row]; 595 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(dht, pj[k] * dof + offset + pcstart)); 596 } 597 /* off-diagonal P */ 598 for (j = 0; j < nzi; j++) { 599 row = aj[j]; 600 offset = row % dof; 601 row /= dof; 602 nzpi = po->i[row + 1] - po->i[row]; 603 pj = PetscSafePointerPlusOffset(po->j, po->i[row]); 604 for (k = 0; k < nzpi; k++) PetscCall(PetscHSetIAdd(oht, p->garray[pj[k]] * dof + offset)); 605 } 606 607 /* off-diagonal part: Ao[i, :]*P_oth */ 608 if (ao) { 609 ai = ao->i; 610 pi = p_oth->i; 611 nzi = ai[i + 1] - ai[i]; 612 aj = ao->j + ai[i]; 613 for (j = 0; j < nzi; j++) { 614 row = aj[j]; 615 offset = a->garray[row] % dof; 616 row = map[row]; 617 pnz = pi[row + 1] - pi[row]; 618 p_othcols = p_oth->j + pi[row]; 619 for (col = 0; col < pnz; col++) { 620 column = p_othcols[col] * dof + offset; 621 if (column >= pcstart && column < pcend) { 622 PetscCall(PetscHSetIAdd(dht, column)); 623 } else { 624 PetscCall(PetscHSetIAdd(oht, column)); 625 } 626 } 627 } 628 } /* end if (ao) */ 629 PetscFunctionReturn(PETSC_SUCCESS); 630 } 631 632 static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A, Mat P, Mat P_oth, const PetscInt *map, PetscInt dof, PetscInt i, PetscHMapIV hmap) 633 { 634 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data; 635 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data, *p_oth = (Mat_SeqAIJ *)P_oth->data, *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 636 PetscInt *ai, nzi, j, *aj, row, col, *pi, pnz, *p_othcols, pcstart, *pj, k, nzpi, offset; 637 PetscScalar ra, *aa, *pa; 638 639 PetscFunctionBegin; 640 pcstart = P->cmap->rstart; 641 pcstart *= dof; 642 643 /* diagonal portion: Ad[i,:]*P */ 644 ai = ad->i; 645 nzi = ai[i + 1] - ai[i]; 646 aj = ad->j + ai[i]; 647 aa = ad->a + ai[i]; 648 for (j = 0; j < nzi; j++) { 649 ra = aa[j]; 650 row = aj[j]; 651 offset = row % dof; 652 row /= dof; 653 nzpi = pd->i[row + 1] - pd->i[row]; 654 pj = pd->j + pd->i[row]; 655 pa = pd->a + pd->i[row]; 656 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, pj[k] * dof + offset + pcstart, ra * pa[k])); 657 PetscCall(PetscLogFlops(2.0 * nzpi)); 658 } 659 for (j = 0; j < nzi; j++) { 660 ra = aa[j]; 661 row = aj[j]; 662 offset = row % dof; 663 row /= dof; 664 nzpi = po->i[row + 1] - po->i[row]; 665 pj = PetscSafePointerPlusOffset(po->j, po->i[row]); 666 pa = PetscSafePointerPlusOffset(po->a, po->i[row]); 667 for (k = 0; k < nzpi; k++) PetscCall(PetscHMapIVAddValue(hmap, p->garray[pj[k]] * dof + offset, ra * pa[k])); 668 PetscCall(PetscLogFlops(2.0 * nzpi)); 669 } 670 671 /* off-diagonal part: Ao[i, :]*P_oth */ 672 if (ao) { 673 ai = ao->i; 674 pi = p_oth->i; 675 nzi = ai[i + 1] - ai[i]; 676 aj = ao->j + ai[i]; 677 aa = ao->a + ai[i]; 678 for (j = 0; j < nzi; j++) { 679 row = aj[j]; 680 offset = a->garray[row] % dof; 681 row = map[row]; 682 ra = aa[j]; 683 pnz = pi[row + 1] - pi[row]; 684 p_othcols = p_oth->j + pi[row]; 685 pa = p_oth->a + pi[row]; 686 for (col = 0; col < pnz; col++) PetscCall(PetscHMapIVAddValue(hmap, p_othcols[col] * dof + offset, ra * pa[col])); 687 PetscCall(PetscLogFlops(2.0 * pnz)); 688 } 689 } /* end if (ao) */ 690 PetscFunctionReturn(PETSC_SUCCESS); 691 } 692 693 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat, Mat, PetscInt dof, MatReuse, Mat *); 694 695 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt 
dof, Mat C) 696 { 697 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data; 698 Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data; 699 Mat_APMPI *ptap; 700 PetscHMapIV hmap; 701 PetscInt i, j, jj, kk, nzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, ccstart, ccend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, *dcc, *occ, loc; 702 PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa; 703 PetscInt offset, ii, pocol; 704 const PetscInt *mappingindices; 705 IS map; 706 707 PetscFunctionBegin; 708 MatCheckProduct(C, 4); 709 ptap = (Mat_APMPI *)C->product->data; 710 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 711 PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 712 713 PetscCall(MatZeroEntries(C)); 714 715 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 716 if (ptap->reuse == MAT_REUSE_MATRIX) { 717 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 718 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth)); 719 } 720 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 721 722 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 723 pon *= dof; 724 PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta)); 725 PetscCall(MatGetLocalSize(A, &am, NULL)); 726 cmaxr = 0; 727 for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); 728 PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc)); 729 PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap)); 730 PetscCall(ISGetIndices(map, &mappingindices)); 731 for (i = 0; i < am && pon; i++) { 732 PetscCall(PetscHMapIVClear(hmap)); 733 offset = i % dof; 734 ii = i / dof; 735 nzi = po->i[ii + 1] - po->i[ii]; 736 if (!nzi) continue; 737 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 738 voff = 0; 739 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 740 if (!voff) continue; 741 742 /* Form C(ii, :) */ 743 poj = po->j + po->i[ii]; 744 poa = po->a + po->i[ii]; 745 for (j = 0; j < nzi; j++) { 746 pocol = poj[j] * dof + offset; 747 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 748 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 749 for (jj = 0; jj < voff; jj++) { 750 apvaluestmp[jj] = apvalues[jj] * poa[j]; 751 /* If the row is empty */ 752 if (!c_rmtc[pocol]) { 753 c_rmtjj[jj] = apindices[jj]; 754 c_rmtaa[jj] = apvaluestmp[jj]; 755 c_rmtc[pocol]++; 756 } else { 757 PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc)); 758 if (loc >= 0) { /* hit */ 759 c_rmtaa[loc] += apvaluestmp[jj]; 760 PetscCall(PetscLogFlops(1.0)); 761 } else { /* new element */ 762 loc = -(loc + 1); 763 /* Move data backward */ 764 for (kk = c_rmtc[pocol]; kk > loc; kk--) { 765 c_rmtjj[kk] = c_rmtjj[kk - 1]; 766 c_rmtaa[kk] = c_rmtaa[kk - 1]; 767 } /* End kk */ 768 c_rmtjj[loc] = apindices[jj]; 769 c_rmtaa[loc] = apvaluestmp[jj]; 770 c_rmtc[pocol]++; 771 } 772 } 773 PetscCall(PetscLogFlops(voff)); 774 } /* End jj */ 775 } /* End j */ 776 } /* End i */ 777 778 PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc)); 779 PetscCall(PetscHMapIVDestroy(&hmap)); 780 781 PetscCall(MatGetLocalSize(P, NULL, &pn)); 782 pn *= dof; 783 PetscCall(PetscCalloc2(ptap->c_othi[pn], 
&c_othj, ptap->c_othi[pn], &c_otha)); 784 785 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 786 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 787 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 788 pcstart = pcstart * dof; 789 pcend = pcend * dof; 790 cd = (Mat_SeqAIJ *)c->A->data; 791 co = (Mat_SeqAIJ *)c->B->data; 792 793 cmaxr = 0; 794 for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); 795 PetscCall(PetscCalloc5(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pn, &dcc, pn, &occ)); 796 PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap)); 797 for (i = 0; i < am && pn; i++) { 798 PetscCall(PetscHMapIVClear(hmap)); 799 offset = i % dof; 800 ii = i / dof; 801 nzi = pd->i[ii + 1] - pd->i[ii]; 802 if (!nzi) continue; 803 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 804 voff = 0; 805 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 806 if (!voff) continue; 807 /* Form C(ii, :) */ 808 pdj = pd->j + pd->i[ii]; 809 pda = pd->a + pd->i[ii]; 810 for (j = 0; j < nzi; j++) { 811 row = pcstart + pdj[j] * dof + offset; 812 for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; 813 PetscCall(PetscLogFlops(voff)); 814 PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES)); 815 } 816 } 817 PetscCall(ISRestoreIndices(map, &mappingindices)); 818 PetscCall(MatGetOwnershipRangeColumn(C, &ccstart, &ccend)); 819 PetscCall(PetscFree5(apindices, apvalues, apvaluestmp, dcc, occ)); 820 PetscCall(PetscHMapIVDestroy(&hmap)); 821 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 822 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 823 PetscCall(PetscFree2(c_rmtj, c_rmta)); 824 825 /* Add contributions from remote */ 826 for (i = 0; i < pn; i++) { 827 row = i + pcstart; 828 PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES)); 829 } 830 PetscCall(PetscFree2(c_othj, c_otha)); 831 832 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 833 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 834 835 ptap->reuse = MAT_REUSE_MATRIX; 836 PetscFunctionReturn(PETSC_SUCCESS); 837 } 838 839 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, Mat C) 840 { 841 PetscFunctionBegin; 842 PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, C)); 843 PetscFunctionReturn(PETSC_SUCCESS); 844 } 845 846 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, Mat C) 847 { 848 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data, *c = (Mat_MPIAIJ *)C->data; 849 Mat_SeqAIJ *cd, *co, *po = (Mat_SeqAIJ *)p->B->data, *pd = (Mat_SeqAIJ *)p->A->data; 850 Mat_APMPI *ptap; 851 PetscHMapIV hmap; 852 PetscInt i, j, jj, kk, nzi, dnzi, *c_rmtj, voff, *c_othj, pn, pon, pcstart, pcend, row, am, *poj, *pdj, *apindices, cmaxr, *c_rmtc, *c_rmtjj, loc; 853 PetscScalar *c_rmta, *c_otha, *poa, *pda, *apvalues, *apvaluestmp, *c_rmtaa; 854 PetscInt offset, ii, pocol; 855 const PetscInt *mappingindices; 856 IS map; 857 858 PetscFunctionBegin; 859 MatCheckProduct(C, 4); 860 ptap = (Mat_APMPI *)C->product->data; 861 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. 
Missing data"); 862 PetscCheck(ptap->P_oth, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()"); 863 864 PetscCall(MatZeroEntries(C)); 865 866 /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 867 if (ptap->reuse == MAT_REUSE_MATRIX) { 868 /* P_oth and P_loc are obtained in MatPtASymbolic() when reuse == MAT_INITIAL_MATRIX */ 869 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_REUSE_MATRIX, &ptap->P_oth)); 870 } 871 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 872 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 873 pon *= dof; 874 PetscCall(MatGetLocalSize(P, NULL, &pn)); 875 pn *= dof; 876 877 PetscCall(PetscCalloc2(ptap->c_rmti[pon], &c_rmtj, ptap->c_rmti[pon], &c_rmta)); 878 PetscCall(MatGetLocalSize(A, &am, NULL)); 879 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 880 pcstart *= dof; 881 pcend *= dof; 882 cmaxr = 0; 883 for (i = 0; i < pon; i++) cmaxr = PetscMax(cmaxr, ptap->c_rmti[i + 1] - ptap->c_rmti[i]); 884 cd = (Mat_SeqAIJ *)c->A->data; 885 co = (Mat_SeqAIJ *)c->B->data; 886 for (i = 0; i < pn; i++) cmaxr = PetscMax(cmaxr, (cd->i[i + 1] - cd->i[i]) + (co->i[i + 1] - co->i[i])); 887 PetscCall(PetscCalloc4(cmaxr, &apindices, cmaxr, &apvalues, cmaxr, &apvaluestmp, pon, &c_rmtc)); 888 PetscCall(PetscHMapIVCreateWithSize(cmaxr, &hmap)); 889 PetscCall(ISGetIndices(map, &mappingindices)); 890 for (i = 0; i < am && (pon || pn); i++) { 891 PetscCall(PetscHMapIVClear(hmap)); 892 offset = i % dof; 893 ii = i / dof; 894 nzi = po->i[ii + 1] - po->i[ii]; 895 dnzi = pd->i[ii + 1] - pd->i[ii]; 896 if (!nzi && !dnzi) continue; 897 PetscCall(MatPtAPNumericComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, hmap)); 898 voff = 0; 899 PetscCall(PetscHMapIVGetPairs(hmap, &voff, apindices, apvalues)); 900 if (!voff) continue; 901 902 /* Form remote C(ii, :) */ 903 poj = PetscSafePointerPlusOffset(po->j, po->i[ii]); 904 poa = PetscSafePointerPlusOffset(po->a, po->i[ii]); 905 for (j = 0; j < nzi; j++) { 906 pocol = poj[j] * dof + offset; 907 c_rmtjj = c_rmtj + ptap->c_rmti[pocol]; 908 c_rmtaa = c_rmta + ptap->c_rmti[pocol]; 909 for (jj = 0; jj < voff; jj++) { 910 apvaluestmp[jj] = apvalues[jj] * poa[j]; 911 /* If the row is empty */ 912 if (!c_rmtc[pocol]) { 913 c_rmtjj[jj] = apindices[jj]; 914 c_rmtaa[jj] = apvaluestmp[jj]; 915 c_rmtc[pocol]++; 916 } else { 917 PetscCall(PetscFindInt(apindices[jj], c_rmtc[pocol], c_rmtjj, &loc)); 918 if (loc >= 0) { /* hit */ 919 c_rmtaa[loc] += apvaluestmp[jj]; 920 PetscCall(PetscLogFlops(1.0)); 921 } else { /* new element */ 922 loc = -(loc + 1); 923 /* Move data backward */ 924 for (kk = c_rmtc[pocol]; kk > loc; kk--) { 925 c_rmtjj[kk] = c_rmtjj[kk - 1]; 926 c_rmtaa[kk] = c_rmtaa[kk - 1]; 927 } /* End kk */ 928 c_rmtjj[loc] = apindices[jj]; 929 c_rmtaa[loc] = apvaluestmp[jj]; 930 c_rmtc[pocol]++; 931 } 932 } 933 } /* End jj */ 934 PetscCall(PetscLogFlops(voff)); 935 } /* End j */ 936 937 /* Form local C(ii, :) */ 938 pdj = pd->j + pd->i[ii]; 939 pda = pd->a + pd->i[ii]; 940 for (j = 0; j < dnzi; j++) { 941 row = pcstart + pdj[j] * dof + offset; 942 for (jj = 0; jj < voff; jj++) apvaluestmp[jj] = apvalues[jj] * pda[j]; /* End kk */ 943 PetscCall(PetscLogFlops(voff)); 944 PetscCall(MatSetValues(C, 1, &row, voff, apindices, apvaluestmp, ADD_VALUES)); 945 } /* End j */ 946 } /* End i */ 947 948 PetscCall(ISRestoreIndices(map, &mappingindices)); 949 PetscCall(PetscFree4(apindices, apvalues, apvaluestmp, c_rmtc)); 950 
PetscCall(PetscHMapIVDestroy(&hmap)); 951 PetscCall(PetscCalloc2(ptap->c_othi[pn], &c_othj, ptap->c_othi[pn], &c_otha)); 952 953 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 954 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 955 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 956 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_SCALAR, c_rmta, c_otha, MPI_REPLACE)); 957 PetscCall(PetscFree2(c_rmtj, c_rmta)); 958 959 /* Add contributions from remote */ 960 for (i = 0; i < pn; i++) { 961 row = i + pcstart; 962 PetscCall(MatSetValues(C, 1, &row, ptap->c_othi[i + 1] - ptap->c_othi[i], PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]), PetscSafePointerPlusOffset(c_otha, ptap->c_othi[i]), ADD_VALUES)); 963 } 964 PetscCall(PetscFree2(c_othj, c_otha)); 965 966 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 967 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 968 969 ptap->reuse = MAT_REUSE_MATRIX; 970 PetscFunctionReturn(PETSC_SUCCESS); 971 } 972 973 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, Mat C) 974 { 975 PetscFunctionBegin; 976 PetscCall(MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, C)); 977 PetscFunctionReturn(PETSC_SUCCESS); 978 } 979 980 /* TODO: move algorithm selection to MatProductSetFromOptions */ 981 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) 982 { 983 Mat_APMPI *ptap; 984 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data; 985 MPI_Comm comm; 986 Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 987 MatType mtype; 988 PetscSF sf; 989 PetscSFNode *iremote; 990 PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves; 991 const PetscInt *rootdegrees; 992 PetscHSetI ht, oht, *hta, *hto; 993 PetscInt pn, pon, *c_rmtc, i, j, nzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets; 994 PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj; 995 PetscInt nalg = 2, alg = 0, offset, ii; 996 PetscMPIInt owner; 997 const PetscInt *mappingindices; 998 PetscBool flg; 999 const char *algTypes[2] = {"overlapping", "merged"}; 1000 IS map; 1001 1002 PetscFunctionBegin; 1003 MatCheckProduct(Cmpi, 5); 1004 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1005 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1006 1007 /* Create symbolic parallel matrix Cmpi */ 1008 PetscCall(MatGetLocalSize(P, NULL, &pn)); 1009 pn *= dof; 1010 PetscCall(MatGetType(A, &mtype)); 1011 PetscCall(MatSetType(Cmpi, mtype)); 1012 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1013 1014 PetscCall(PetscNew(&ptap)); 1015 ptap->reuse = MAT_INITIAL_MATRIX; 1016 ptap->algType = 2; 1017 1018 /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1019 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth)); 1020 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 1021 /* This equals to the number of offdiag columns in P */ 1022 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 1023 pon *= dof; 1024 /* offsets */ 1025 PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti)); 1026 /* The number of columns we will send to remote ranks */ 1027 PetscCall(PetscMalloc1(pon, &c_rmtc)); 1028 PetscCall(PetscMalloc1(pon, &hta)); 1029 for (i = 0; i < pon; i++) 
PetscCall(PetscHSetICreate(&hta[i])); 1030 PetscCall(MatGetLocalSize(A, &am, NULL)); 1031 PetscCall(MatGetOwnershipRange(A, &arstart, &arend)); 1032 /* Create hash table to merge all columns for C(i, :) */ 1033 PetscCall(PetscHSetICreate(&ht)); 1034 1035 PetscCall(ISGetIndices(map, &mappingindices)); 1036 ptap->c_rmti[0] = 0; 1037 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1038 for (i = 0; i < am && pon; i++) { 1039 /* Form one row of AP */ 1040 PetscCall(PetscHSetIClear(ht)); 1041 offset = i % dof; 1042 ii = i / dof; 1043 /* If the off-diagonal is empty, do not do any calculation */ 1044 nzi = po->i[ii + 1] - po->i[ii]; 1045 if (!nzi) continue; 1046 1047 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, ht)); 1048 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1049 /* If AP is empty, just continue */ 1050 if (!htsize) continue; 1051 /* Form C(ii, :) */ 1052 poj = po->j + po->i[ii]; 1053 for (j = 0; j < nzi; j++) PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht)); 1054 } 1055 1056 for (i = 0; i < pon; i++) { 1057 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1058 ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize; 1059 c_rmtc[i] = htsize; 1060 } 1061 1062 PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj)); 1063 1064 for (i = 0; i < pon; i++) { 1065 off = 0; 1066 PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i])); 1067 PetscCall(PetscHSetIDestroy(&hta[i])); 1068 } 1069 PetscCall(PetscFree(hta)); 1070 1071 PetscCall(PetscMalloc1(pon, &iremote)); 1072 for (i = 0; i < pon; i++) { 1073 owner = 0; 1074 lidx = 0; 1075 offset = i % dof; 1076 ii = i / dof; 1077 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx)); 1078 iremote[i].index = lidx * dof + offset; 1079 iremote[i].rank = owner; 1080 } 1081 1082 PetscCall(PetscSFCreate(comm, &sf)); 1083 PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1084 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1085 PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE)); 1086 PetscCall(PetscSFSetFromOptions(sf)); 1087 PetscCall(PetscSFSetUp(sf)); 1088 /* How many neighbors have contributions to my rows? 
*/ 1089 PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees)); 1090 PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees)); 1091 rootspacesize = 0; 1092 for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i]; 1093 PetscCall(PetscMalloc1(rootspacesize, &rootspace)); 1094 PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets)); 1095 /* Get information from leaves 1096 * Number of columns other people contribute to my rows 1097 * */ 1098 PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace)); 1099 PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace)); 1100 PetscCall(PetscFree(c_rmtc)); 1101 PetscCall(PetscCalloc1(pn + 1, &ptap->c_othi)); 1102 /* The number of columns is received for each row */ 1103 ptap->c_othi[0] = 0; 1104 rootspacesize = 0; 1105 rootspaceoffsets[0] = 0; 1106 for (i = 0; i < pn; i++) { 1107 rcvncols = 0; 1108 for (j = 0; j < rootdegrees[i]; j++) { 1109 rcvncols += rootspace[rootspacesize]; 1110 rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1111 rootspacesize++; 1112 } 1113 ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols; 1114 } 1115 PetscCall(PetscFree(rootspace)); 1116 1117 PetscCall(PetscMalloc1(pon, &c_rmtoffsets)); 1118 PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1119 PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1120 PetscCall(PetscSFDestroy(&sf)); 1121 PetscCall(PetscFree(rootspaceoffsets)); 1122 1123 PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote)); 1124 nleaves = 0; 1125 for (i = 0; i < pon; i++) { 1126 owner = 0; 1127 ii = i / dof; 1128 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL)); 1129 sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i]; 1130 for (j = 0; j < sendncols; j++) { 1131 iremote[nleaves].rank = owner; 1132 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1133 } 1134 } 1135 PetscCall(PetscFree(c_rmtoffsets)); 1136 PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj)); 1137 1138 PetscCall(PetscSFCreate(comm, &ptap->sf)); 1139 PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1140 PetscCall(PetscSFSetFromOptions(ptap->sf)); 1141 /* One to one map */ 1142 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1143 1144 PetscCall(PetscMalloc2(pn, &dnz, pn, &onz)); 1145 PetscCall(PetscHSetICreate(&oht)); 1146 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 1147 pcstart *= dof; 1148 pcend *= dof; 1149 PetscCall(PetscMalloc2(pn, &hta, pn, &hto)); 1150 for (i = 0; i < pn; i++) { 1151 PetscCall(PetscHSetICreate(&hta[i])); 1152 PetscCall(PetscHSetICreate(&hto[i])); 1153 } 1154 /* Work on local part */ 1155 /* 4) Pass 1: Estimate memory for C_loc */ 1156 for (i = 0; i < am && pn; i++) { 1157 PetscCall(PetscHSetIClear(ht)); 1158 PetscCall(PetscHSetIClear(oht)); 1159 offset = i % dof; 1160 ii = i / dof; 1161 nzi = pd->i[ii + 1] - pd->i[ii]; 1162 if (!nzi) continue; 1163 1164 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht)); 1165 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1166 PetscCall(PetscHSetIGetSize(oht, &htosize)); 1167 if (!(htsize + htosize)) continue; 1168 /* Form C(ii, :) */ 1169 pdj = pd->j + pd->i[ii]; 1170 for (j = 0; j < nzi; j++) { 1171 PetscCall(PetscHSetIUpdate(hta[pdj[j] * dof + offset], ht)); 1172 PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht)); 1173 } 1174 } 1175 1176 PetscCall(ISRestoreIndices(map, 
&mappingindices)); 1177 1178 PetscCall(PetscHSetIDestroy(&ht)); 1179 PetscCall(PetscHSetIDestroy(&oht)); 1180 1181 /* Get remote data */ 1182 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1183 PetscCall(PetscFree(c_rmtj)); 1184 1185 for (i = 0; i < pn; i++) { 1186 nzi = ptap->c_othi[i + 1] - ptap->c_othi[i]; 1187 rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]); 1188 for (j = 0; j < nzi; j++) { 1189 col = rdj[j]; 1190 /* diag part */ 1191 if (col >= pcstart && col < pcend) { 1192 PetscCall(PetscHSetIAdd(hta[i], col)); 1193 } else { /* off-diagonal */ 1194 PetscCall(PetscHSetIAdd(hto[i], col)); 1195 } 1196 } 1197 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1198 dnz[i] = htsize; 1199 PetscCall(PetscHSetIDestroy(&hta[i])); 1200 PetscCall(PetscHSetIGetSize(hto[i], &htsize)); 1201 onz[i] = htsize; 1202 PetscCall(PetscHSetIDestroy(&hto[i])); 1203 } 1204 1205 PetscCall(PetscFree2(hta, hto)); 1206 PetscCall(PetscFree(c_othj)); 1207 1208 /* local sizes and preallocation */ 1209 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1210 PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs)); 1211 PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz)); 1212 PetscCall(MatSetUp(Cmpi)); 1213 PetscCall(PetscFree2(dnz, onz)); 1214 1215 /* attach the supporting struct to Cmpi for reuse */ 1216 Cmpi->product->data = ptap; 1217 Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP; 1218 Cmpi->product->view = MatView_MPIAIJ_PtAP; 1219 1220 /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */ 1221 Cmpi->assembled = PETSC_FALSE; 1222 /* pick an algorithm */ 1223 PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat"); 1224 alg = 0; 1225 PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg)); 1226 PetscOptionsEnd(); 1227 switch (alg) { 1228 case 0: 1229 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce; 1230 break; 1231 case 1: 1232 Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged; 1233 break; 1234 default: 1235 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, " Unsupported allatonce numerical algorithm "); 1236 } 1237 PetscFunctionReturn(PETSC_SUCCESS); 1238 } 1239 1240 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A, Mat P, PetscReal fill, Mat C) 1241 { 1242 PetscFunctionBegin; 1243 PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A, P, 1, fill, C)); 1244 PetscFunctionReturn(PETSC_SUCCESS); 1245 } 1246 1247 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A, Mat P, PetscInt dof, PetscReal fill, Mat Cmpi) 1248 { 1249 Mat_APMPI *ptap; 1250 Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data; 1251 MPI_Comm comm; 1252 Mat_SeqAIJ *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data; 1253 MatType mtype; 1254 PetscSF sf; 1255 PetscSFNode *iremote; 1256 PetscInt rootspacesize, *rootspace, *rootspaceoffsets, nleaves; 1257 const PetscInt *rootdegrees; 1258 PetscHSetI ht, oht, *hta, *hto, *htd; 1259 PetscInt pn, pon, *c_rmtc, i, j, nzi, dnzi, htsize, htosize, *c_rmtj, off, *c_othj, rcvncols, sendncols, *c_rmtoffsets; 1260 PetscInt lidx, *rdj, col, pcstart, pcend, *dnz, *onz, am, arstart, arend, *poj, *pdj; 1261 PetscInt nalg = 2, alg = 0, offset, ii; 1262 PetscMPIInt owner; 1263 PetscBool flg; 1264 const char *algTypes[2] = {"merged", "overlapping"}; 1265 const PetscInt *mappingindices; 1266 IS map; 1267 1268 
PetscFunctionBegin; 1269 MatCheckProduct(Cmpi, 5); 1270 PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty"); 1271 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1272 1273 /* Create symbolic parallel matrix Cmpi */ 1274 PetscCall(MatGetLocalSize(P, NULL, &pn)); 1275 pn *= dof; 1276 PetscCall(MatGetType(A, &mtype)); 1277 PetscCall(MatSetType(Cmpi, mtype)); 1278 PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1279 1280 PetscCall(PetscNew(&ptap)); 1281 ptap->reuse = MAT_INITIAL_MATRIX; 1282 ptap->algType = 3; 1283 1284 /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1285 PetscCall(MatGetBrowsOfAcols_MPIXAIJ(A, P, dof, MAT_INITIAL_MATRIX, &ptap->P_oth)); 1286 PetscCall(PetscObjectQuery((PetscObject)ptap->P_oth, "aoffdiagtopothmapping", (PetscObject *)&map)); 1287 1288 /* This equals to the number of offdiag columns in P */ 1289 PetscCall(MatGetLocalSize(p->B, NULL, &pon)); 1290 pon *= dof; 1291 /* offsets */ 1292 PetscCall(PetscMalloc1(pon + 1, &ptap->c_rmti)); 1293 /* The number of columns we will send to remote ranks */ 1294 PetscCall(PetscMalloc1(pon, &c_rmtc)); 1295 PetscCall(PetscMalloc1(pon, &hta)); 1296 for (i = 0; i < pon; i++) PetscCall(PetscHSetICreate(&hta[i])); 1297 PetscCall(MatGetLocalSize(A, &am, NULL)); 1298 PetscCall(MatGetOwnershipRange(A, &arstart, &arend)); 1299 /* Create hash table to merge all columns for C(i, :) */ 1300 PetscCall(PetscHSetICreate(&ht)); 1301 PetscCall(PetscHSetICreate(&oht)); 1302 PetscCall(PetscMalloc2(pn, &htd, pn, &hto)); 1303 for (i = 0; i < pn; i++) { 1304 PetscCall(PetscHSetICreate(&htd[i])); 1305 PetscCall(PetscHSetICreate(&hto[i])); 1306 } 1307 1308 PetscCall(ISGetIndices(map, &mappingindices)); 1309 ptap->c_rmti[0] = 0; 1310 /* 2) Pass 1: calculate the size for C_rmt (a matrix need to be sent to other processors) */ 1311 for (i = 0; i < am && (pon || pn); i++) { 1312 /* Form one row of AP */ 1313 PetscCall(PetscHSetIClear(ht)); 1314 PetscCall(PetscHSetIClear(oht)); 1315 offset = i % dof; 1316 ii = i / dof; 1317 /* If the off-diagonal is empty, do not do any calculation */ 1318 nzi = po->i[ii + 1] - po->i[ii]; 1319 dnzi = pd->i[ii + 1] - pd->i[ii]; 1320 if (!nzi && !dnzi) continue; 1321 1322 PetscCall(MatPtAPSymbolicComputeOneRowOfAP_private(A, P, ptap->P_oth, mappingindices, dof, i, ht, oht)); 1323 PetscCall(PetscHSetIGetSize(ht, &htsize)); 1324 PetscCall(PetscHSetIGetSize(oht, &htosize)); 1325 /* If AP is empty, just continue */ 1326 if (!(htsize + htosize)) continue; 1327 1328 /* Form remote C(ii, :) */ 1329 poj = PetscSafePointerPlusOffset(po->j, po->i[ii]); 1330 for (j = 0; j < nzi; j++) { 1331 PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], ht)); 1332 PetscCall(PetscHSetIUpdate(hta[poj[j] * dof + offset], oht)); 1333 } 1334 1335 /* Form local C(ii, :) */ 1336 pdj = pd->j + pd->i[ii]; 1337 for (j = 0; j < dnzi; j++) { 1338 PetscCall(PetscHSetIUpdate(htd[pdj[j] * dof + offset], ht)); 1339 PetscCall(PetscHSetIUpdate(hto[pdj[j] * dof + offset], oht)); 1340 } 1341 } 1342 1343 PetscCall(ISRestoreIndices(map, &mappingindices)); 1344 1345 PetscCall(PetscHSetIDestroy(&ht)); 1346 PetscCall(PetscHSetIDestroy(&oht)); 1347 1348 for (i = 0; i < pon; i++) { 1349 PetscCall(PetscHSetIGetSize(hta[i], &htsize)); 1350 ptap->c_rmti[i + 1] = ptap->c_rmti[i] + htsize; 1351 c_rmtc[i] = htsize; 1352 } 1353 1354 PetscCall(PetscMalloc1(ptap->c_rmti[pon], &c_rmtj)); 1355 1356 for (i = 0; i < pon; i++) { 1357 off = 0; 1358 
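    /*
       Note: PetscHSetIGetElems() returns the set elements in no particular order; that is acceptable here
       because these indices are only pushed through the PetscSF and re-inserted into hash sets on the
       receiving side, never consumed as a sorted list.
    */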
PetscCall(PetscHSetIGetElems(hta[i], &off, c_rmtj + ptap->c_rmti[i])); 1359 PetscCall(PetscHSetIDestroy(&hta[i])); 1360 } 1361 PetscCall(PetscFree(hta)); 1362 1363 PetscCall(PetscMalloc1(pon, &iremote)); 1364 for (i = 0; i < pon; i++) { 1365 owner = 0; 1366 lidx = 0; 1367 offset = i % dof; 1368 ii = i / dof; 1369 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, &lidx)); 1370 iremote[i].index = lidx * dof + offset; 1371 iremote[i].rank = owner; 1372 } 1373 1374 PetscCall(PetscSFCreate(comm, &sf)); 1375 PetscCall(PetscSFSetGraph(sf, pn, pon, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1376 /* Reorder ranks properly so that the data handled by gather and scatter have the same order */ 1377 PetscCall(PetscSFSetRankOrder(sf, PETSC_TRUE)); 1378 PetscCall(PetscSFSetFromOptions(sf)); 1379 PetscCall(PetscSFSetUp(sf)); 1380 /* How many neighbors have contributions to my rows? */ 1381 PetscCall(PetscSFComputeDegreeBegin(sf, &rootdegrees)); 1382 PetscCall(PetscSFComputeDegreeEnd(sf, &rootdegrees)); 1383 rootspacesize = 0; 1384 for (i = 0; i < pn; i++) rootspacesize += rootdegrees[i]; 1385 PetscCall(PetscMalloc1(rootspacesize, &rootspace)); 1386 PetscCall(PetscMalloc1(rootspacesize + 1, &rootspaceoffsets)); 1387 /* Get information from leaves 1388 * Number of columns other people contribute to my rows 1389 * */ 1390 PetscCall(PetscSFGatherBegin(sf, MPIU_INT, c_rmtc, rootspace)); 1391 PetscCall(PetscSFGatherEnd(sf, MPIU_INT, c_rmtc, rootspace)); 1392 PetscCall(PetscFree(c_rmtc)); 1393 PetscCall(PetscMalloc1(pn + 1, &ptap->c_othi)); 1394 /* The number of columns is received for each row */ 1395 ptap->c_othi[0] = 0; 1396 rootspacesize = 0; 1397 rootspaceoffsets[0] = 0; 1398 for (i = 0; i < pn; i++) { 1399 rcvncols = 0; 1400 for (j = 0; j < rootdegrees[i]; j++) { 1401 rcvncols += rootspace[rootspacesize]; 1402 rootspaceoffsets[rootspacesize + 1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize]; 1403 rootspacesize++; 1404 } 1405 ptap->c_othi[i + 1] = ptap->c_othi[i] + rcvncols; 1406 } 1407 PetscCall(PetscFree(rootspace)); 1408 1409 PetscCall(PetscMalloc1(pon, &c_rmtoffsets)); 1410 PetscCall(PetscSFScatterBegin(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1411 PetscCall(PetscSFScatterEnd(sf, MPIU_INT, rootspaceoffsets, c_rmtoffsets)); 1412 PetscCall(PetscSFDestroy(&sf)); 1413 PetscCall(PetscFree(rootspaceoffsets)); 1414 1415 PetscCall(PetscCalloc1(ptap->c_rmti[pon], &iremote)); 1416 nleaves = 0; 1417 for (i = 0; i < pon; i++) { 1418 owner = 0; 1419 ii = i / dof; 1420 PetscCall(PetscLayoutFindOwnerIndex(P->cmap, p->garray[ii], &owner, NULL)); 1421 sendncols = ptap->c_rmti[i + 1] - ptap->c_rmti[i]; 1422 for (j = 0; j < sendncols; j++) { 1423 iremote[nleaves].rank = owner; 1424 iremote[nleaves++].index = c_rmtoffsets[i] + j; 1425 } 1426 } 1427 PetscCall(PetscFree(c_rmtoffsets)); 1428 PetscCall(PetscCalloc1(ptap->c_othi[pn], &c_othj)); 1429 1430 PetscCall(PetscSFCreate(comm, &ptap->sf)); 1431 PetscCall(PetscSFSetGraph(ptap->sf, ptap->c_othi[pn], nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER)); 1432 PetscCall(PetscSFSetFromOptions(ptap->sf)); 1433 /* One to one map */ 1434 PetscCall(PetscSFReduceBegin(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1435 /* Get remote data */ 1436 PetscCall(PetscSFReduceEnd(ptap->sf, MPIU_INT, c_rmtj, c_othj, MPI_REPLACE)); 1437 PetscCall(PetscFree(c_rmtj)); 1438 PetscCall(PetscMalloc2(pn, &dnz, pn, &onz)); 1439 PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, &pcend)); 1440 pcstart *= dof; 1441 pcend *= dof; 1442 for 
  for (i = 0; i < pn; i++) {
    nzi = ptap->c_othi[i + 1] - ptap->c_othi[i];
    rdj = PetscSafePointerPlusOffset(c_othj, ptap->c_othi[i]);
    for (j = 0; j < nzi; j++) {
      col = rdj[j];
      /* diagonal part */
      if (col >= pcstart && col < pcend) {
        PetscCall(PetscHSetIAdd(htd[i], col));
      } else { /* off-diagonal */
        PetscCall(PetscHSetIAdd(hto[i], col));
      }
    }
    PetscCall(PetscHSetIGetSize(htd[i], &htsize));
    dnz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&htd[i]));
    PetscCall(PetscHSetIGetSize(hto[i], &htsize));
    onz[i] = htsize;
    PetscCall(PetscHSetIDestroy(&hto[i]));
  }

  PetscCall(PetscFree2(htd, hto));
  PetscCall(PetscFree(c_othj));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(Cmpi, dof > 1 ? dof : P->cmap->bs, dof > 1 ? dof : P->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  PetscCall(PetscFree2(dnz, onz));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  /* pick an algorithm */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "MatPtAP", "Mat");
  alg = 0;
  PetscCall(PetscOptionsEList("-matptap_allatonce_via", "PtAP allatonce numeric approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
  PetscOptionsEnd();
  switch (alg) {
  case 0:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
    break;
  case 1:
    Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported allatonce numerical algorithm");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A, Mat P, PetscReal fill, Mat C)
{
  PetscFunctionBegin;
  PetscCall(MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A, P, 1, fill, C));
  PetscFunctionReturn(PETSC_SUCCESS);
}
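/*
   For orientation: a minimal, uncompiled sketch (illustrative matrices A, P, C, not part of this
   file) of how user-level code reaches the all-at-once PtAP routines above through the MatProduct
   interface:

     Mat C;
     PetscCall(MatProductCreate(A, P, NULL, &C));        // C will hold P^T * A * P
     PetscCall(MatProductSetType(C, MATPRODUCT_PtAP));
     PetscCall(MatProductSetAlgorithm(C, "allatonce"));  // or "allatonce_merged"
     PetscCall(MatProductSetFill(C, PETSC_DEFAULT));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));                   // dispatches to the symbolic routine above
     PetscCall(MatProductNumeric(C));                    // runs the selected ptapnumeric kernel

   The -matptap_allatonce_via option queried above further switches the numeric kernel between the
   merged and non-merged all-at-once variants.
*/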
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat Cmpi)
{
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank, nsend, proc;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 am = A->rmap->n, pm = P->rmap->n, pN = P->cmap->N, pn = P->cmap->n;
  PetscInt                *lnk, i, k, pnz, row;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k;
  PetscInt                 len, *dnz, *onz, *owners, nzi, nspacedouble;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r;          /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *api, *apj, *Jptr, apnz, *prmap = p->garray, con, j, ap_rmax = 0, Crmax, *aj, *ai, *pi;
  Mat_SeqAIJ              *p_loc, *p_oth = NULL, *ad = (Mat_SeqAIJ *)a->A->data, *ao = NULL, *c_loc, *c_oth;
  PetscScalar             *apv;
  PetscHMapI               ta;
  MatType                  mtype;
  const char              *prefix;
#if defined(PETSC_USE_INFO)
  PetscReal apfill;
#endif

  PetscFunctionBegin;
  MatCheckProduct(Cmpi, 4);
  PetscCheck(!Cmpi->product->data, PetscObjectComm((PetscObject)Cmpi), PETSC_ERR_PLIB, "Product data not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  if (size > 1) ao = (Mat_SeqAIJ *)a->B->data;

  /* create symbolic parallel matrix Cmpi */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(Cmpi, mtype));

  /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));
  ptap->reuse   = MAT_INITIAL_MATRIX;
  ptap->algType = 1;

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
  p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;

  /* create and initialize a linked list */
  PetscCall(PetscHMapICreateWithSize(pn, &ta)); /* for computing AP_loc and Cmpi */
  MatRowMergeMax_SeqAIJ(p_loc, ptap->P_loc->rmap->N, ta);
  MatRowMergeMax_SeqAIJ(p_oth, ptap->P_oth->rmap->N, ta);
  PetscCall(PetscHMapIGetSize(ta, &Crmax)); /* Crmax = nnz(sum of Prows) */
  /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));

  /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
  if (ao) {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], PetscIntSumTruncate(ao->i[am], p_loc->i[pm]))), &free_space));
  } else {
    PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ad->i[am], p_loc->i[pm])), &free_space));
  }
  current_space = free_space;
  nspacedouble  = 0;

  PetscCall(PetscMalloc1(am + 1, &api));
  api[0] = 0;
  for (i = 0; i < am; i++) {
    /* diagonal portion: Ad[i,:]*P */
    ai  = ad->i;
    pi  = p_loc->i;
    nzi = ai[i + 1] - ai[i];
    aj  = PetscSafePointerPlusOffset(ad->j, ai[i]);
    for (j = 0; j < nzi; j++) {
      row  = aj[j];
      pnz  = pi[row + 1] - pi[row];
      Jptr = p_loc->j + pi[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
    }
    /* off-diagonal portion: Ao[i,:]*P */
    if (ao) {
      ai  = ao->i;
      pi  = p_oth->i;
      nzi = ai[i + 1] - ai[i];
      aj  = PetscSafePointerPlusOffset(ao->j, ai[i]);
      for (j = 0; j < nzi; j++) {
        row  = aj[j];
        pnz  = pi[row + 1] - pi[row];
        Jptr = p_oth->j + pi[row];
        PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt));
      }
    }
    apnz       = lnk[0];
    api[i + 1] = api[i] + apnz;
    if (ap_rmax < apnz) ap_rmax = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < apnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, lnkbt));

    current_space->array = PetscSafePointerPlusOffset(current_space->array, apnz);
    current_space->local_used += apnz;
    current_space->local_remaining -= apnz;
  }
  /* Allocate space for apj and apv, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  PetscCall(PetscMalloc2(api[am], &apj, api[am], &apv));
  PetscCall(PetscFreeSpaceContiguous(&free_space, apj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  /* Create AP_loc for reuse */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, pN, api, apj, apv, &ptap->AP_loc));
  PetscCall(MatSetType(ptap->AP_loc, ((PetscObject)p->A)->type_name));
#if defined(PETSC_USE_INFO)
  if (ao) {
    apfill = (PetscReal)api[am] / (ad->i[am] + ao->i[am] + p_loc->i[pm] + 1);
  } else {
    apfill = (PetscReal)api[am] / (ad->i[am] + p_loc->i[pm] + 1);
  }
  ptap->AP_loc->info.mallocs           = nspacedouble;
  ptap->AP_loc->info.fill_ratio_given  = fill;
  ptap->AP_loc->info.fill_ratio_needed = apfill;

  if (api[am]) {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)apfill));
    PetscCall(PetscInfo(ptap->AP_loc, "Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n", (double)apfill));
  } else {
    PetscCall(PetscInfo(ptap->AP_loc, "Nonscalable algorithm, AP_loc is empty\n"));
  }
#endif

  /* (2-1) compute symbolic Co = Ro*AP_loc */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
  PetscCall(MatProductCreate(ptap->Ro, ptap->AP_loc, NULL, &ptap->C_oth));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_oth, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_oth, "inner_C_oth_"));
  PetscCall(MatProductSetType(ptap->C_oth, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_oth, "default"));
  PetscCall(MatProductSetFill(ptap->C_oth, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_oth));
  PetscCall(MatProductSymbolic(ptap->C_oth));

  /* (3) send coj of C_oth to other processors */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  rowmap->n  = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++;                     /* num of rows in Co(=Pt*AP) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPIU_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*AP_loc */
  PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
  PetscCall(MatProductCreate(ptap->Rd, ptap->AP_loc, NULL, &ptap->C_loc));
  PetscCall(MatGetOptionsPrefix(Cmpi, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->C_loc, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->C_loc, "inner_C_loc_"));
  PetscCall(MatProductSetType(ptap->C_loc, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(ptap->C_loc, "default"));
  PetscCall(MatProductSetFill(ptap->C_loc, fill));
  PetscCall(MatProductSetFromOptions(ptap->C_loc));
  PetscCall(MatProductSymbolic(ptap->C_loc));

  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscHMapISet(ta, *(Jptr + j) + 1, 1));
  }
  PetscCall(PetscHMapIGetSize(ta, &Crmax));
  PetscCall(PetscHMapIDestroy(&ta));

  /* (4) send and recv coi */
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of Cmpi */
  /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  MatPreallocateBegin(comm, pn, pn, dnz, onz);
  PetscCall(PetscLLCondensedCreate(Crmax, pN, &lnk, &lnkbt));
  for (i = 0; i < pn; i++) {
    /* add C_loc into Cmpi */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = PetscSafePointerPlusOffset(c_loc->j, c_loc->i[i]);
    PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {     /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
        nextrow[k]++;
        nextci[k]++;
      }
    }
    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, nzi, current_space->array, lnk, lnkbt));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(Cmpi, pn, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) {
    PetscCall(PetscLayoutSetBlockSize(Cmpi->rmap, P->cmap->bs));
    PetscCall(PetscLayoutSetBlockSize(Cmpi->cmap, P->cmap->bs));
  }
  PetscCall(MatMPIAIJSetPreallocation(Cmpi, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  PetscCall(PetscCalloc1(pN, &ptap->apa));

  /* attach the supporting struct to Cmpi for reuse */
  Cmpi->product->data    = ptap;
  Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
  Cmpi->product->view    = MatView_MPIAIJ_PtAP;

  /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
  Cmpi->assembled = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
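/*
   Illustration (hypothetical numbers) of the i-structure message format built in step (4) of the
   symbolic routine above. A message carrying nrows = 2 rows of Co, with local row indices 3 and 7
   holding 5 and 2 nonzeros respectively, is packed as

     buf_si = { 2,   3, 7,   0, 5, 7 }
                ^    ^---^   ^------^
             nrows   rows    running i-structure (prefix sums of the row lengths)

   so its length is 2*(nrows+1) = 6, matching the len_si[proc] value computed before the sends.
*/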
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C)
{
  Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *ad = (Mat_SeqAIJ *)a->A->data, *ao = (Mat_SeqAIJ *)a->B->data;
  Mat_SeqAIJ        *ap, *p_loc, *p_oth = NULL, *c_seq;
  Mat_APMPI         *ptap;
  Mat                AP_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row;
  PetscInt          *api, *apj, am = A->rmap->n, j, col, apnz;
  PetscScalar       *apa;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->AP_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be reused. Do not call MatProductClear()");

  PetscCall(MatZeroEntries(C));
  /* 1) get R = Pd^T, Ro = Po^T */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
    PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));
  }

  /* 2) get AP_loc */
  AP_loc = ptap->AP_loc;
  ap     = (Mat_SeqAIJ *)AP_loc->data;

  /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  if (ptap->reuse == MAT_REUSE_MATRIX) {
    /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));
    PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc));
  }

  /* 2-2) compute numeric A_loc*P - dominating part */
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ *)ptap->P_loc->data;
  if (ptap->P_oth) p_oth = (Mat_SeqAIJ *)ptap->P_oth->data;
  apa = ptap->apa;
  api = ap->i;
  apj = ap->j;
  for (i = 0; i < am; i++) {
    /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
    AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa);
    apnz = api[i + 1] - api[i];
    for (j = 0; j < apnz; j++) {
      col                 = apj[j + api[i]];
      ap->a[j + ap->i[i]] = apa[col];
      apa[col]            = 0.0;
    }
  }
  /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
  PetscCall(PetscObjectStateIncrease((PetscObject)AP_loc));

  /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
  PetscCall(MatProductNumeric(ptap->C_loc));
  PetscCall(MatProductNumeric(ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;

  /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
  /* when there are no off-processor parts. */
  /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
  /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
  /* a table, and other, more complex stuff has to be done. */
  if (C->assembled) {
    C->was_assembled = PETSC_TRUE;
    C->assembled     = PETSC_FALSE;
  }
  if (C->was_assembled) {
    for (i = 0; i < cm; i++) {
      ncols = c_seq->i[i + 1] - c_seq->i[i];
      row   = rstart + i;
      PetscCall(MatSetValues_MPIAIJ(C, 1, &row, ncols, cols, vals, ADD_VALUES));
      cols = PetscSafePointerPlusOffset(cols, ncols);
      vals += ncols;
    }
  } else {
    PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat(C, c_seq->j, c_seq->i, c_seq->a));
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  ptap->reuse = MAT_REUSE_MATRIX;
  PetscFunctionReturn(PETSC_SUCCESS);
}
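/*
   A minimal, uncompiled sketch (illustrative names, not part of this file) of the reuse pattern the
   numeric routine above supports: the first MAT_INITIAL_MATRIX call builds and keeps AP_loc,
   P_loc/P_oth and the transposes Rd/Ro, so later calls with MAT_REUSE_MATRIX only refresh them and
   redo the numeric work through the MAT_REUSE_MATRIX branches above.

     Mat C;
     PetscCall(MatPtAP(A, P, MAT_INITIAL_MATRIX, 2.0, &C)); // symbolic + numeric, fill estimate 2.0
     // ... change the numerical values of A (same nonzero pattern) ...
     PetscCall(MatPtAP(A, P, MAT_REUSE_MATRIX, 2.0, &C));   // numeric only
*/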
PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A = product->A, P = product->B;
  MatProductAlgorithm alg  = product->alg;
  PetscReal           fill = product->fill;
  PetscBool           flg;

  PetscFunctionBegin;
  /* scalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "scalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A, P, product->fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    goto next;
  }

  /* nonscalable: do R=P^T locally, then C=R*A*P */
  PetscCall(PetscStrcmp(alg, "nonscalable", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ(A, P, fill, C));
    goto next;
  }

  /* allatonce */
  PetscCall(PetscStrcmp(alg, "allatonce", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A, P, fill, C));
    goto next;
  }

  /* allatonce_merged */
  PetscCall(PetscStrcmp(alg, "allatonce_merged", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A, P, fill, C));
    goto next;
  }

  /* backend general code */
  PetscCall(PetscStrcmp(alg, "backend", &flg));
  if (flg) {
    PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* hypre */
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscStrcmp(alg, "hypre", &flg));
  if (flg) {
    PetscCall(MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A, P, fill, C));
    C->ops->productnumeric = MatProductNumeric_PtAP;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported");

next:
  C->ops->productnumeric = MatProductNumeric_PtAP;
  PetscFunctionReturn(PETSC_SUCCESS);
}
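/*
   The dispatcher above recognizes the algorithm strings "scalable", "nonscalable", "allatonce",
   "allatonce_merged", "backend", and (when PETSc is configured with hypre) "hypre". A sketch of
   selecting one programmatically, assuming a product C set up as in the earlier usage note:

     PetscCall(MatProductSetAlgorithm(C, "scalable"));
     PetscCall(MatProductSetFromOptions(C));
*/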