1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 /* 22 Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks 23 24 The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but 25 that is a very major recoding job. 

   Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
*/
static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
{
  PetscFunctionBegin;
  for (PetscInt i = 0; i < imax; i++) {
    if (!is[i]) break;
    PetscInt        n = 0, N, Nmax, Nmin;
    const PetscInt *idx;
    PetscInt       *nidx = NULL;
    MPI_Comm        comm;
    PetscBT         bt;

    PetscCall(ISGetLocalSize(is[i], &N));
    if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
      PetscCall(ISGetIndices(is[i], &idx));
      PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
      /* work in block indices; the bit table only needs to span the touched block range */
      Nmin = Nmin / bs;
      Nmax = Nmax / bs;
      PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
      /* first pass: count the distinct blocks touched by the indices */
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
      }
      PetscCall(PetscMalloc1(n, &nidx));
      n = 0;
      PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
      /* second pass: record each distinct block index exactly once */
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
      }
      PetscCall(PetscBTDestroy(&bt));
      PetscCall(ISRestoreIndices(is[i], &idx));
    }
    PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
    PetscCall(ISDestroy(is + i));
    /* replace the general IS by a block IS that owns nidx (PETSC_OWN_POINTER: no copy, no free here) */
    PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Increases the overlap of the imax index sets is[] by ov levels, one level per call of
   MatIncreaseOverlap_MPIAIJ_Once(); afterwards adjusts the ISs to full blocks when the
   matrix has a square block size.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Memory-scalable variant of MatIncreaseOverlap_MPIAIJ(): same contract, but the per-level
   work avoids allocating arrays proportional to the global matrix size.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   One level of overlap increase, memory-scalable path.

   Strategy: split each IS's rows into locally-owned rows (handled directly by
   MatIncreaseOverlap_MPIAIJ_Local_Scalable()) and remote rows, which are shipped to their
   owners via a PetscSF built from a PetscCommBuildTwoSided() handshake; the owners expand
   them (MatIncreaseOverlap_MPIAIJ_Send_Scalable()) and the expanded rows are merged back in
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable().
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  tlength = 0;
  for (PetscInt i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (PetscInt i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (PetscInt j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (PetscMPIInt i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size (2 entries per row: <is id, row>) */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* MPI process */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (PetscInt i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (PetscMPIInt i = 0; i < nfrom; i++) {
    for (PetscInt j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (PetscInt i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* process */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (PetscInt i = 0; i < nto; i++) {
    for (PetscInt j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Merges the expanded rows received from remote owners (recvdata, a stream of
   <is id, count, cols...> triples of total length nrecvs) into the local ISs,
   sorting and removing duplicates, and recreates each is[i] in place.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of (max_lsize + nrecvs) entries per IS is always large enough to hold
     the existing indices plus everything received */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i = 0; i < nidx; i++) {
    indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   For each row requested by a remote rank (fromrows, grouped per sender in fromsizes),
   gathers that row's column indices from the local diagonal (A) and off-diagonal (B)
   blocks and packs the reply buffers: *sbrows holds <is id, count, cols...> triples,
   *sbrowsizes holds per-destination <size, offset> pairs. Caller frees both outputs.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
  rows_pos  = 0;
  totalrows = 0;
  /* first sweep: only count the (deduplicated) reply sizes so sbdata can be
     allocated once; the second sweep below repeats the traversal to fill it */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Expands each IS by one overlap level using only locally-owned rows: for every index
   owned by this rank, all column indices of that row (diagonal block A plus off-diagonal
   block B via the garray/gcols map) are collected, sorted, deduplicated, and each is[i]
   is recreated in place. Remote indices are skipped (handled by the send/receive path).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0, nrqs, *pa;
  PetscInt     M, k, **rbuf, row, msz, **outdat, **ptr, *isz1;
  PetscInt    *ctr, *tmp, *isz, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (PetscInt i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (PetscInt i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (PetscInt j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (PetscMPIInt j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j]; /* total indices destined for rank j */
        w3[j]++;        /* number of ISs with at least one index for rank j */
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (PetscMPIInt i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (PetscMPIInt i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (PetscMPIInt i = 0; i < nrqs; i++) {
    PetscMPIInt j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  /* carve the single tmp buffer into one contiguous slice per destination rank */
  {
    PetscInt *iptr = tmp, ict = 0;
    for (PetscMPIInt i = 0; i < nrqs; i++) {
      PetscMPIInt j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (PetscMPIInt i = 0; i < nrqs; i++) {
    PetscMPIInt j = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1; /* ptr[j] walks the data area past the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (PetscInt i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (PetscInt i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0];
          outdat_j[2 * k]     = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (PetscMPIInt i = 0; i < nrqs; ++i) {
    PetscMPIInt j = pa[i];
    PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (PetscInt i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));
    for (PetscMPIInt i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc]));
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (PetscMPIInt i = 0; i < nrqr; ++i) { PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i)); }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (PetscMPIInt i = 0; i < nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1; /* data starts right after the header */
      jmax    = rbuf2[idex][0];
      for (PetscInt j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];
        is_no   = rbuf2_i[2 * j - 1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (PetscInt i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  for (PetscInt i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscHashIter tpos;
    PetscInt      j;

    table_data_i = table_data[i];
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, k);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --k; /* hash map stores key+1 -> pos+1; undo both offsets */
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI    table_data_i;
  PetscHashIter tpos;
  PetscInt      tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, row);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --row; /* entries are stored as key+1 -> pos+1 */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i]; /* only expand the rows present before this call; rows appended below are next level's work */

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
     MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
  and return the output

  Input:
  C - the matrix
  nrqr - no of messages being processed.
  rbuf - an array of pointers to the received requests

  Output:
  xdata - array of messages to be sent back
  isz1 - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a remalloc is required we need only copy the data for that one row
  rather than all previous rows as it is now where a single large chunk of
  memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable; /* scratch bitmap over all global rows, reset per IS, to deduplicate */

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count total requested rows (ct) and total row entries (total_sz) over all messages;
     each message is laid out as [nIS, (tag,len) pairs, row indices...] */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  /* estimate reply size from the average number of nonzeros per local row */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    /* all xdata[i] share one contiguous allocation anchored at xdata[0] */
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0;
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2 * rbuf_0 + 1; /* read cursor: skip the header */
    ct2    = ct1;            /* write cursor into xdata[i] */
    ct3 += ct1;
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m, xtable));
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      /* first copy the requested rows themselves (deduplicated) */
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single shared buffer by 1.5x and rebase all xdata[] pointers */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each accepted row by the columns it touches (one level of overlap) */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) { /* diagonal block: shift local columns to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) { /* off-diagonal block: garray[] gives global columns */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
    isz1[i] = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  /* note: the counts reported here are in PetscInt units, not raw bytes */
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt size, rank;
  PetscInt    sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
  PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;
  PetscMPIInt iN, inrecv, inz;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* Tell every processor the number of nonzeros per row */
    PetscCall(PetscCalloc1(A->rmap->N, &lens));
    for (PetscInt i =
A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];

    /* All MPI processes get the same matrix */
    /* lens[] is zero except on the owning rows, so SUM-reducing in place gives every
       process the full row-length array */
    PetscCall(PetscMPIIntCast(A->rmap->N, &iN));
    PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, lens, iN, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Create the sequential matrix of the same type as the local block diagonal */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /* zero column space */
    nrecv = 0;
    for (PetscMPIInt i = 0; i < size; i++) {
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
    }
    PetscCall(PetscArrayzero(b->j, nrecv));

    /* Copy my part of matrix column indices over, writing at the offset of my first row */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* each row is emitted in global column order: B entries left of the diagonal block,
       then the A (diagonal) block, then the remaining B entries */
    for (PetscInt i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* send column indices, b->j was zeroed */
    PetscCall(PetscMPIIntCast(nrecv, &inrecv));
    PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->j, inrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Assemble the matrix into usable form (numerical values not yet set) */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /* Copy my part of matrix numerical values into the values location */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf;

    /* initialize b->a */
    PetscCall(PetscArrayzero(b->a, b->nz));

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are written in the same lower-B / A / upper-B order used for the
       column indices above so they stay aligned */
    for (PetscInt i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* send values, b->a was zeroed */
    PetscCall(PetscMPIIntCast(b->nz, &inz));
    PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->a, inz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));

    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         submat, A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
  PetscInt   *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
  PetscInt    cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
  const PetscInt *icol, *irow;
  PetscInt        nrow, ncol, start;
  PetscMPIInt     rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
  PetscInt      **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
  PetscInt        msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
  PetscInt      **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
  PetscInt       *lens, rmax, ncols, *cols, Crow;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI cmap, rmap;
  PetscInt  *cmap_loc, *rmap_loc;
#else
  PetscInt *cmap, *rmap;
#endif
  PetscInt     ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
  PetscInt    *cworkB, lwrite, *subcols, ib, jb;
  PetscScalar *vworkA, *vworkB,
*a_a, *b_a, *subvals = NULL; 1240 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 1241 MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4; 1242 MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1243 MPI_Status *r_status3 = NULL, *r_status4, *s_status4; 1244 MPI_Comm comm; 1245 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1246 PetscMPIInt *onodes1, *olengths1, idex, end, *row2proc; 1247 Mat_SubSppt *smatis1; 1248 PetscBool isrowsorted, iscolsorted; 1249 1250 PetscFunctionBegin; 1251 PetscValidLogicalCollectiveInt(C, ismax, 2); 1252 PetscValidLogicalCollectiveEnum(C, scall, 5); 1253 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1254 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1255 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1256 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1257 size = c->size; 1258 rank = c->rank; 1259 1260 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1261 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1262 PetscCall(ISGetIndices(isrow[0], &irow)); 1263 PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1264 if (allcolumns) { 1265 icol = NULL; 1266 ncol = C->cmap->N; 1267 } else { 1268 PetscCall(ISGetIndices(iscol[0], &icol)); 1269 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1270 } 1271 1272 if (scall == MAT_INITIAL_MATRIX) { 1273 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1274 1275 /* Get some new tags to keep the communication clean */ 1276 tag1 = ((PetscObject)C)->tag; 1277 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1278 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1279 1280 /* evaluate communication - mesg to who, length of mesg, and buffer space 1281 required. 
Based on this, buffers are allocated, and data copied into them */ 1282 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1283 PetscCall(PetscMalloc1(nrow, &row2proc)); 1284 1285 /* w1[proc] = num of rows owned by proc -- to be requested */ 1286 proc = 0; 1287 nrqs = 0; /* num of outgoing messages */ 1288 for (PetscInt j = 0; j < nrow; j++) { 1289 row = irow[j]; 1290 if (!isrowsorted) proc = 0; 1291 while (row >= C->rmap->range[proc + 1]) proc++; 1292 w1[proc]++; 1293 row2proc[j] = proc; /* map row index to proc */ 1294 1295 if (proc != rank && !w2[proc]) { 1296 w2[proc] = 1; 1297 nrqs++; 1298 } 1299 } 1300 w1[rank] = 0; /* rows owned by self will not be requested */ 1301 1302 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1303 for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) { 1304 if (w1[proc]) pa[j++] = proc; 1305 } 1306 1307 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1308 msz = 0; /* total mesg length (for all procs) */ 1309 for (PetscMPIInt i = 0; i < nrqs; i++) { 1310 proc = pa[i]; 1311 w1[proc] += 3; 1312 msz += w1[proc]; 1313 } 1314 PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1315 1316 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1317 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1318 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1319 1320 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1321 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1322 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1323 1324 /* Now post the Irecvs corresponding to these messages */ 1325 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1326 1327 PetscCall(PetscFree(onodes1)); 1328 PetscCall(PetscFree(olengths1)); 1329 1330 /* Allocate Memory for outgoing messages */ 
1331 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1332 PetscCall(PetscArrayzero(sbuf1, size)); 1333 PetscCall(PetscArrayzero(ptr, size)); 1334 1335 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1336 iptr = tmp; 1337 for (PetscMPIInt i = 0; i < nrqs; i++) { 1338 proc = pa[i]; 1339 sbuf1[proc] = iptr; 1340 iptr += w1[proc]; 1341 } 1342 1343 /* Form the outgoing messages */ 1344 /* Initialize the header space */ 1345 for (PetscMPIInt i = 0; i < nrqs; i++) { 1346 proc = pa[i]; 1347 PetscCall(PetscArrayzero(sbuf1[proc], 3)); 1348 ptr[proc] = sbuf1[proc] + 3; 1349 } 1350 1351 /* Parse the isrow and copy data into outbuf */ 1352 PetscCall(PetscArrayzero(ctr, size)); 1353 for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1354 proc = row2proc[j]; 1355 if (proc != rank) { /* copy to the outgoing buf*/ 1356 *ptr[proc] = irow[j]; 1357 ctr[proc]++; 1358 ptr[proc]++; 1359 } 1360 } 1361 1362 /* Update the headers for the current IS */ 1363 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */ 1364 if ((ctr_j = ctr[j])) { 1365 sbuf1_j = sbuf1[j]; 1366 k = ++sbuf1_j[0]; 1367 sbuf1_j[2 * k] = ctr_j; 1368 sbuf1_j[2 * k - 1] = 0; 1369 } 1370 } 1371 1372 /* Now post the sends */ 1373 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1374 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1375 proc = pa[i]; 1376 PetscCallMPI(MPIU_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i)); 1377 } 1378 1379 /* Post Receives to capture the buffer size */ 1380 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1381 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1382 1383 if (nrqs) rbuf2[0] = tmp + msz; 1384 for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1385 1386 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1387 proc = pa[i]; 1388 PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, 
comm, r_waits2 + i)); 1389 } 1390 1391 PetscCall(PetscFree2(w1, w2)); 1392 1393 /* Send to other procs the buf size they should allocate */ 1394 /* Receive messages*/ 1395 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1396 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1397 1398 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1399 for (PetscMPIInt i = 0; i < nrqr; ++i) { 1400 req_size[i] = 0; 1401 rbuf1_i = rbuf1[i]; 1402 start = 2 * rbuf1_i[0] + 1; 1403 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1404 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1405 sbuf2_i = sbuf2[i]; 1406 for (PetscInt j = start; j < end; j++) { 1407 k = rbuf1_i[j] - rstart; 1408 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1409 sbuf2_i[j] = ncols; 1410 req_size[i] += ncols; 1411 } 1412 req_source1[i] = r_status1[i].MPI_SOURCE; 1413 1414 /* form the header */ 1415 sbuf2_i[0] = req_size[i]; 1416 for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1417 1418 PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1419 } 1420 1421 PetscCall(PetscFree(r_status1)); 1422 PetscCall(PetscFree(r_waits1)); 1423 1424 /* rbuf2 is received, Post recv column indices a->j */ 1425 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1426 1427 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1428 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1429 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 1430 req_source2[i] = r_status2[i].MPI_SOURCE; 1431 PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1432 } 1433 1434 /* Wait on sends1 and sends2 */ 1435 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1436 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1437 PetscCall(PetscFree(s_waits1)); 1438 PetscCall(PetscFree(s_status1)); 1439 1440 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1441 PetscCall(PetscFree4(r_status2, s_waits2, 
r_waits2, s_status2)); 1442 1443 /* Now allocate sending buffers for a->j, and send them off */ 1444 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 1445 jcnt = 0; 1446 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 1447 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0])); 1448 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1449 1450 for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each requested message */ 1451 rbuf1_i = rbuf1[i]; 1452 sbuf_aj_i = sbuf_aj[i]; 1453 ct1 = 2 * rbuf1_i[0] + 1; 1454 ct2 = 0; 1455 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1456 1457 kmax = rbuf1[i][2]; 1458 for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */ 1459 row = rbuf1_i[ct1] - rstart; 1460 nzA = ai[row + 1] - ai[row]; 1461 nzB = bi[row + 1] - bi[row]; 1462 ncols = nzA + nzB; 1463 cworkA = PetscSafePointerPlusOffset(aj, ai[row]); 1464 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1465 1466 /* load the column indices for this row into cols*/ 1467 cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2); 1468 1469 lwrite = 0; 1470 for (PetscInt l = 0; l < nzB; l++) { 1471 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1472 } 1473 for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1474 for (PetscInt l = 0; l < nzB; l++) { 1475 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1476 } 1477 1478 ct2 += ncols; 1479 } 1480 PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1481 } 1482 1483 /* create column map (cmap): global col of C -> local col of submat */ 1484 #if defined(PETSC_USE_CTABLE) 1485 if (!allcolumns) { 1486 PetscCall(PetscHMapICreateWithSize(ncol, &cmap)); 1487 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1488 for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1489 if (icol[j] >= cstart && icol[j] < cend) { 1490 cmap_loc[icol[j] - 
cstart] = j + 1; 1491 } else { /* use PetscHMapI for non-local col indices */ 1492 PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1)); 1493 } 1494 } 1495 } else { 1496 cmap = NULL; 1497 cmap_loc = NULL; 1498 } 1499 PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1500 #else 1501 if (!allcolumns) { 1502 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1503 for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1504 } else { 1505 cmap = NULL; 1506 } 1507 #endif 1508 1509 /* Create lens for MatSeqAIJSetPreallocation() */ 1510 PetscCall(PetscCalloc1(nrow, &lens)); 1511 1512 /* Compute lens from local part of C */ 1513 for (PetscInt j = 0; j < nrow; j++) { 1514 row = irow[j]; 1515 proc = row2proc[j]; 1516 if (proc == rank) { 1517 /* diagonal part A = c->A */ 1518 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1519 cols = PetscSafePointerPlusOffset(aj, ai[row - rstart]); 1520 if (!allcolumns) { 1521 for (PetscInt k = 0; k < ncols; k++) { 1522 #if defined(PETSC_USE_CTABLE) 1523 tcol = cmap_loc[cols[k]]; 1524 #else 1525 tcol = cmap[cols[k] + cstart]; 1526 #endif 1527 if (tcol) lens[j]++; 1528 } 1529 } else { /* allcolumns */ 1530 lens[j] = ncols; 1531 } 1532 1533 /* off-diagonal part B = c->B */ 1534 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1535 cols = PetscSafePointerPlusOffset(bj, bi[row - rstart]); 1536 if (!allcolumns) { 1537 for (PetscInt k = 0; k < ncols; k++) { 1538 #if defined(PETSC_USE_CTABLE) 1539 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1540 #else 1541 tcol = cmap[bmap[cols[k]]]; 1542 #endif 1543 if (tcol) lens[j]++; 1544 } 1545 } else { /* allcolumns */ 1546 lens[j] += ncols; 1547 } 1548 } 1549 } 1550 1551 /* Create row map (rmap): global row of C -> local row of submat */ 1552 #if defined(PETSC_USE_CTABLE) 1553 PetscCall(PetscHMapICreateWithSize(nrow, &rmap)); 1554 for (PetscInt j = 0; j < nrow; j++) { 1555 row = irow[j]; 1556 proc = row2proc[j]; 1557 if (proc == rank) { /* a local row */ 1558 rmap_loc[row - rstart] = j; 
1559 } else { 1560 PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1)); 1561 } 1562 } 1563 #else 1564 PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1565 for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j; 1566 #endif 1567 1568 /* Update lens from offproc data */ 1569 /* recv a->j is done */ 1570 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1571 for (PetscMPIInt i = 0; i < nrqs; i++) { 1572 proc = pa[i]; 1573 sbuf1_i = sbuf1[proc]; 1574 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1575 ct1 = 2 + 1; 1576 ct2 = 0; 1577 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1578 rbuf3_i = rbuf3[i]; /* received C->j */ 1579 1580 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1581 max1 = sbuf1_i[2]; 1582 for (PetscInt k = 0; k < max1; k++, ct1++) { 1583 #if defined(PETSC_USE_CTABLE) 1584 PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row)); 1585 row--; 1586 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1587 #else 1588 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1589 #endif 1590 /* Now, store row index of submat in sbuf1_i[ct1] */ 1591 sbuf1_i[ct1] = row; 1592 1593 nnz = rbuf2_i[ct1]; 1594 if (!allcolumns) { 1595 for (PetscMPIInt l = 0; l < nnz; l++, ct2++) { 1596 #if defined(PETSC_USE_CTABLE) 1597 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1598 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1599 } else { 1600 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1601 } 1602 #else 1603 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1604 #endif 1605 if (tcol) lens[row]++; 1606 } 1607 } else { /* allcolumns */ 1608 lens[row] += nnz; 1609 } 1610 } 1611 } 1612 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1613 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1614 1615 /* Create the submatrices */ 1616 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1617 
PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1618 1619 PetscCall(ISGetBlockSize(isrow[0], &ib)); 1620 PetscCall(ISGetBlockSize(iscol[0], &jb)); 1621 if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb)); 1622 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1623 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1624 1625 /* create struct Mat_SubSppt and attached it to submat */ 1626 PetscCall(PetscNew(&smatis1)); 1627 subc = (Mat_SeqAIJ *)submat->data; 1628 subc->submatis1 = smatis1; 1629 1630 smatis1->id = 0; 1631 smatis1->nrqs = nrqs; 1632 smatis1->nrqr = nrqr; 1633 smatis1->rbuf1 = rbuf1; 1634 smatis1->rbuf2 = rbuf2; 1635 smatis1->rbuf3 = rbuf3; 1636 smatis1->sbuf2 = sbuf2; 1637 smatis1->req_source2 = req_source2; 1638 1639 smatis1->sbuf1 = sbuf1; 1640 smatis1->ptr = ptr; 1641 smatis1->tmp = tmp; 1642 smatis1->ctr = ctr; 1643 1644 smatis1->pa = pa; 1645 smatis1->req_size = req_size; 1646 smatis1->req_source1 = req_source1; 1647 1648 smatis1->allcolumns = allcolumns; 1649 smatis1->singleis = PETSC_TRUE; 1650 smatis1->row2proc = row2proc; 1651 smatis1->rmap = rmap; 1652 smatis1->cmap = cmap; 1653 #if defined(PETSC_USE_CTABLE) 1654 smatis1->rmap_loc = rmap_loc; 1655 smatis1->cmap_loc = cmap_loc; 1656 #endif 1657 1658 smatis1->destroy = submat->ops->destroy; 1659 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1660 submat->factortype = C->factortype; 1661 1662 /* compute rmax */ 1663 rmax = 0; 1664 for (PetscMPIInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1665 1666 } else { /* scall == MAT_REUSE_MATRIX */ 1667 submat = submats[0]; 1668 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1669 1670 subc = (Mat_SeqAIJ *)submat->data; 1671 rmax = subc->rmax; 1672 smatis1 = subc->submatis1; 1673 nrqs = smatis1->nrqs; 1674 nrqr = smatis1->nrqr; 1675 rbuf1 = smatis1->rbuf1; 1676 rbuf2 = smatis1->rbuf2; 1677 rbuf3 = smatis1->rbuf3; 1678 req_source2 = smatis1->req_source2; 1679 1680 sbuf1 = smatis1->sbuf1; 1681 sbuf2 = smatis1->sbuf2; 1682 ptr = smatis1->ptr; 1683 tmp = smatis1->tmp; 1684 ctr = smatis1->ctr; 1685 1686 pa = smatis1->pa; 1687 req_size = smatis1->req_size; 1688 req_source1 = smatis1->req_source1; 1689 1690 allcolumns = smatis1->allcolumns; 1691 row2proc = smatis1->row2proc; 1692 rmap = smatis1->rmap; 1693 cmap = smatis1->cmap; 1694 #if defined(PETSC_USE_CTABLE) 1695 rmap_loc = smatis1->rmap_loc; 1696 cmap_loc = smatis1->cmap_loc; 1697 #endif 1698 } 1699 1700 /* Post recv matrix values */ 1701 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1702 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1703 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1704 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1705 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1706 PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1707 } 1708 1709 /* Allocate sending buffers for a->a, and send them off */ 1710 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1711 jcnt = 0; 1712 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 1713 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0])); 1714 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1715 1716 for (PetscMPIInt i = 0; i < nrqr; i++) { 1717 rbuf1_i = rbuf1[i]; 1718 sbuf_aa_i = sbuf_aa[i]; 1719 ct1 = 2 * rbuf1_i[0] + 1; 1720 ct2 = 0; 1721 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1722 1723 kmax = rbuf1_i[2]; 1724 for (PetscInt k = 0; k < kmax; k++, ct1++) { 1725 row = rbuf1_i[ct1] - rstart; 1726 nzA 
= ai[row + 1] - ai[row]; 1727 nzB = bi[row + 1] - bi[row]; 1728 ncols = nzA + nzB; 1729 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1730 vworkA = PetscSafePointerPlusOffset(a_a, ai[row]); 1731 vworkB = PetscSafePointerPlusOffset(b_a, bi[row]); 1732 1733 /* load the column values for this row into vals*/ 1734 vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2); 1735 1736 lwrite = 0; 1737 for (PetscInt l = 0; l < nzB; l++) { 1738 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1739 } 1740 for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1741 for (PetscInt l = 0; l < nzB; l++) { 1742 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1743 } 1744 1745 ct2 += ncols; 1746 } 1747 PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1748 } 1749 1750 /* Assemble submat */ 1751 /* First assemble the local rows */ 1752 for (PetscInt j = 0; j < nrow; j++) { 1753 row = irow[j]; 1754 proc = row2proc[j]; 1755 if (proc == rank) { 1756 Crow = row - rstart; /* local row index of C */ 1757 #if defined(PETSC_USE_CTABLE) 1758 row = rmap_loc[Crow]; /* row index of submat */ 1759 #else 1760 row = rmap[row]; 1761 #endif 1762 1763 if (allcolumns) { 1764 PetscInt ncol = 0; 1765 1766 /* diagonal part A = c->A */ 1767 ncols = ai[Crow + 1] - ai[Crow]; 1768 cols = PetscSafePointerPlusOffset(aj, ai[Crow]); 1769 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1770 for (PetscInt k = 0; k < ncols; k++) { 1771 subcols[ncol] = cols[k] + cstart; 1772 subvals[ncol++] = vals[k]; 1773 } 1774 1775 /* off-diagonal part B = c->B */ 1776 ncols = bi[Crow + 1] - bi[Crow]; 1777 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1778 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1779 for (PetscInt k = 0; k < ncols; k++) { 1780 subcols[ncol] = bmap[cols[k]]; 1781 subvals[ncol++] = vals[k]; 1782 } 1783 1784 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES)); 1785 1786 } else { /* 
!allcolumns */ 1787 PetscInt ncol = 0; 1788 1789 #if defined(PETSC_USE_CTABLE) 1790 /* diagonal part A = c->A */ 1791 ncols = ai[Crow + 1] - ai[Crow]; 1792 cols = PetscSafePointerPlusOffset(aj, ai[Crow]); 1793 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1794 for (PetscInt k = 0; k < ncols; k++) { 1795 tcol = cmap_loc[cols[k]]; 1796 if (tcol) { 1797 subcols[ncol] = --tcol; 1798 subvals[ncol++] = vals[k]; 1799 } 1800 } 1801 1802 /* off-diagonal part B = c->B */ 1803 ncols = bi[Crow + 1] - bi[Crow]; 1804 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1805 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1806 for (PetscInt k = 0; k < ncols; k++) { 1807 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1808 if (tcol) { 1809 subcols[ncol] = --tcol; 1810 subvals[ncol++] = vals[k]; 1811 } 1812 } 1813 #else 1814 /* diagonal part A = c->A */ 1815 ncols = ai[Crow + 1] - ai[Crow]; 1816 cols = aj + ai[Crow]; 1817 vals = a_a + ai[Crow]; 1818 for (PetscInt k = 0; k < ncols; k++) { 1819 tcol = cmap[cols[k] + cstart]; 1820 if (tcol) { 1821 subcols[ncol] = --tcol; 1822 subvals[ncol++] = vals[k]; 1823 } 1824 } 1825 1826 /* off-diagonal part B = c->B */ 1827 ncols = bi[Crow + 1] - bi[Crow]; 1828 cols = bj + bi[Crow]; 1829 vals = b_a + bi[Crow]; 1830 for (PetscInt k = 0; k < ncols; k++) { 1831 tcol = cmap[bmap[cols[k]]]; 1832 if (tcol) { 1833 subcols[ncol] = --tcol; 1834 subvals[ncol++] = vals[k]; 1835 } 1836 } 1837 #endif 1838 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES)); 1839 } 1840 } 1841 } 1842 1843 /* Now assemble the off-proc rows */ 1844 for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */ 1845 /* recv values from other processes */ 1846 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1847 proc = pa[idex]; 1848 sbuf1_i = sbuf1[proc]; 1849 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1850 ct1 = 2 + 1; 1851 ct2 = 0; /* 
count of received C->j */ 1852 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1853 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1854 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1855 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1856 1857 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1858 max1 = sbuf1_i[2]; /* num of rows */ 1859 for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1860 row = sbuf1_i[ct1]; /* row index of submat */ 1861 if (!allcolumns) { 1862 idex = 0; 1863 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1864 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1865 for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1866 #if defined(PETSC_USE_CTABLE) 1867 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1868 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1869 } else { 1870 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1871 } 1872 #else 1873 tcol = cmap[rbuf3_i[ct2]]; 1874 #endif 1875 if (tcol) { 1876 subcols[idex] = --tcol; /* may not be sorted */ 1877 subvals[idex++] = rbuf4_i[ct2]; 1878 1879 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
1880 For reuse, we replace received C->j with index that should be inserted to submat */ 1881 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1882 } 1883 } 1884 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1885 } else { /* scall == MAT_REUSE_MATRIX */ 1886 submat = submats[0]; 1887 subc = (Mat_SeqAIJ *)submat->data; 1888 1889 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1890 for (PetscInt l = 0; l < nnz; l++) { 1891 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1892 subvals[idex++] = rbuf4_i[ct2]; 1893 } 1894 1895 bj = subc->j + subc->i[row]; /* sorted column indices */ 1896 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1897 } 1898 } else { /* allcolumns */ 1899 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1900 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES)); 1901 ct2 += nnz; 1902 } 1903 } 1904 } 1905 1906 /* sending a->a are done */ 1907 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1908 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1909 1910 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1911 PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY)); 1912 submats[0] = submat; 1913 1914 /* Restore the indices */ 1915 PetscCall(ISRestoreIndices(isrow[0], &irow)); 1916 if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol)); 1917 1918 /* Destroy allocated memory */ 1919 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 1920 PetscCall(PetscFree3(rbuf4, subcols, subvals)); 1921 if (sbuf_aa) { 1922 PetscCall(PetscFree(sbuf_aa[0])); 1923 PetscCall(PetscFree(sbuf_aa)); 1924 } 1925 1926 if (scall == MAT_INITIAL_MATRIX) { 1927 PetscCall(PetscFree(lens)); 1928 if (sbuf_aj) { 1929 PetscCall(PetscFree(sbuf_aj[0])); 1930 PetscCall(PetscFree(sbuf_aj)); 1931 } 1932 } 
  PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Single-IS driver: handles the case where each process requests exactly one submatrix.
   Detects the "all columns" special case (the column IS is the identity and spans every
   column of C) and forwards to MatCreateSubMatrices_MPIAIJ_SingleIS_Local(), which does
   the actual extraction. */
static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt  ncol;
  PetscBool colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices; 2 entries presumably leave room for a
     trailing dummy slot as in MatCreateSubMatrices_MPIAIJ() -- TODO confirm */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol[0], &colflag));
  PetscCall(ISGetLocalSize(iscol[0], &ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Extracts ismax sequential submatrices of the parallel MPIAIJ matrix C, one per
   (isrow[i], iscol[i]) pair, into *submat. Work proceeds in "stages": each stage
   extracts at most nmax submatrices, where nmax is chosen so a stage's workspace
   stays near 20M PetscInt entries. The stage count and the fast "whole matrix"
   shortcut are agreed upon globally via a single MPIU_Allreduce so that every rank
   executes the same number of collective passes. */
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extracted per stage, sized so that a stage uses
     roughly 20M PetscInts of workspace (at least 1). */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single reduction carrying two values: MPI_MAX over -wantallmatrix acts as a
       global logical AND (the fast path is taken only if every rank wants it), and
       MPI_MAX over nstages gives the global stage count. */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else {
      nstages = smat->nstages; /* stage count saved at MAT_INITIAL_MATRIX time */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  for (PetscInt i = 0, pos = 0; i < nstages; i++) {
    /* max_no = number of submatrices extracted in this stage (0 once all ismax are done;
       the extra all-zero stages exist because nstages is the global maximum) */
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  Mat              A = c->A;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
  const PetscInt **icol, **irow;
  PetscInt        *nrow, *ncol, start;
  PetscMPIInt      nrqs = 0, *pa, proc = -1;
  PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
  PetscInt
**sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row; 2063 PetscInt msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol; 2064 PetscInt **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2065 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2066 #if defined(PETSC_USE_CTABLE) 2067 PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i; 2068 #else 2069 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2070 #endif 2071 const PetscInt *irow_i; 2072 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2073 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2074 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2075 MPI_Comm comm; 2076 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2077 PetscMPIInt *onodes1, *olengths1, end, **row2proc, *row2proc_i; 2078 PetscInt ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2079 Mat_SubSppt *smat_i; 2080 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2081 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt; 2082 2083 PetscFunctionBegin; 2084 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2085 size = c->size; 2086 rank = c->rank; 2087 2088 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2089 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2090 2091 for (PetscInt i = 0; i < ismax; i++) { 2092 PetscCall(ISSorted(iscol[i], &issorted[i])); 2093 if (!issorted[i]) iscsorted = issorted[i]; 2094 2095 PetscCall(ISSorted(isrow[i], &issorted[i])); 2096 2097 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2098 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2099 2100 /* Check for special case: allcolumn */ 2101 PetscCall(ISIdentity(iscol[i], &colflag)); 2102 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2103 if (colflag && ncol[i] == C->cmap->N) { 2104 allcolumns[i] = PETSC_TRUE; 2105 icol[i] = NULL; 2106 } else { 2107 allcolumns[i] 
= PETSC_FALSE; 2108 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2109 } 2110 } 2111 2112 if (scall == MAT_REUSE_MATRIX) { 2113 /* Assumes new rows are same length as the old rows */ 2114 for (PetscInt i = 0; i < ismax; i++) { 2115 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2116 subc = (Mat_SeqAIJ *)submats[i]->data; 2117 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size"); 2118 2119 /* Initial matrix as if empty */ 2120 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2121 2122 smat_i = subc->submatis1; 2123 2124 nrqs = smat_i->nrqs; 2125 nrqr = smat_i->nrqr; 2126 rbuf1 = smat_i->rbuf1; 2127 rbuf2 = smat_i->rbuf2; 2128 rbuf3 = smat_i->rbuf3; 2129 req_source2 = smat_i->req_source2; 2130 2131 sbuf1 = smat_i->sbuf1; 2132 sbuf2 = smat_i->sbuf2; 2133 ptr = smat_i->ptr; 2134 tmp = smat_i->tmp; 2135 ctr = smat_i->ctr; 2136 2137 pa = smat_i->pa; 2138 req_size = smat_i->req_size; 2139 req_source1 = smat_i->req_source1; 2140 2141 allcolumns[i] = smat_i->allcolumns; 2142 row2proc[i] = smat_i->row2proc; 2143 rmap[i] = smat_i->rmap; 2144 cmap[i] = smat_i->cmap; 2145 } 2146 2147 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2148 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2149 smat_i = (Mat_SubSppt *)submats[0]->data; 2150 2151 nrqs = smat_i->nrqs; 2152 nrqr = smat_i->nrqr; 2153 rbuf1 = smat_i->rbuf1; 2154 rbuf2 = smat_i->rbuf2; 2155 rbuf3 = smat_i->rbuf3; 2156 req_source2 = smat_i->req_source2; 2157 2158 sbuf1 = smat_i->sbuf1; 2159 sbuf2 = smat_i->sbuf2; 2160 ptr = smat_i->ptr; 2161 tmp = smat_i->tmp; 2162 ctr = smat_i->ctr; 2163 2164 pa = smat_i->pa; 2165 req_size = smat_i->req_size; 2166 req_source1 = smat_i->req_source1; 2167 2168 allcolumns[0] = PETSC_FALSE; 2169 } 2170 } else { /* scall == MAT_INITIAL_MATRIX */ 
2171 /* Get some new tags to keep the communication clean */ 2172 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2173 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2174 2175 /* evaluate communication - mesg to who, length of mesg, and buffer space 2176 required. Based on this, buffers are allocated, and data copied into them*/ 2177 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2178 2179 for (PetscInt i = 0; i < ismax; i++) { 2180 jmax = nrow[i]; 2181 irow_i = irow[i]; 2182 2183 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2184 row2proc[i] = row2proc_i; 2185 2186 if (issorted[i]) proc = 0; 2187 for (PetscInt j = 0; j < jmax; j++) { 2188 if (!issorted[i]) proc = 0; 2189 row = irow_i[j]; 2190 while (row >= C->rmap->range[proc + 1]) proc++; 2191 w4[proc]++; 2192 row2proc_i[j] = proc; /* map row index to proc */ 2193 } 2194 for (PetscMPIInt j = 0; j < size; j++) { 2195 if (w4[j]) { 2196 w1[j] += w4[j]; 2197 w3[j]++; 2198 w4[j] = 0; 2199 } 2200 } 2201 } 2202 2203 nrqs = 0; /* no of outgoing messages */ 2204 msz = 0; /* total mesg length (for all procs) */ 2205 w1[rank] = 0; /* no mesg sent to self */ 2206 w3[rank] = 0; 2207 for (PetscMPIInt i = 0; i < size; i++) { 2208 if (w1[i]) { 2209 w2[i] = 1; 2210 nrqs++; 2211 } /* there exists a message to proc i */ 2212 } 2213 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2214 for (PetscMPIInt i = 0, j = 0; i < size; i++) { 2215 if (w1[i]) { 2216 pa[j] = i; 2217 j++; 2218 } 2219 } 2220 2221 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2222 for (PetscMPIInt i = 0; i < nrqs; i++) { 2223 PetscMPIInt j = pa[i]; 2224 w1[j] += w2[j] + 2 * w3[j]; 2225 msz += w1[j]; 2226 } 2227 PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2228 2229 /* Determine the number of messages to expect, their lengths, from from-ids */ 2230 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, 
&nrqr)); 2231 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2232 2233 /* Now post the Irecvs corresponding to these messages */ 2234 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2235 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2236 2237 /* Allocate Memory for outgoing messages */ 2238 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2239 PetscCall(PetscArrayzero(sbuf1, size)); 2240 PetscCall(PetscArrayzero(ptr, size)); 2241 2242 { 2243 PetscInt *iptr = tmp; 2244 k = 0; 2245 for (PetscMPIInt i = 0; i < nrqs; i++) { 2246 PetscMPIInt j = pa[i]; 2247 iptr += k; 2248 sbuf1[j] = iptr; 2249 k = w1[j]; 2250 } 2251 } 2252 2253 /* Form the outgoing messages. Initialize the header space */ 2254 for (PetscMPIInt i = 0; i < nrqs; i++) { 2255 PetscMPIInt j = pa[i]; 2256 sbuf1[j][0] = 0; 2257 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2258 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2259 } 2260 2261 /* Parse the isrow and copy data into outbuf */ 2262 for (PetscInt i = 0; i < ismax; i++) { 2263 row2proc_i = row2proc[i]; 2264 PetscCall(PetscArrayzero(ctr, size)); 2265 irow_i = irow[i]; 2266 jmax = nrow[i]; 2267 for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2268 proc = row2proc_i[j]; 2269 if (proc != rank) { /* copy to the outgoing buf*/ 2270 ctr[proc]++; 2271 *ptr[proc] = irow_i[j]; 2272 ptr[proc]++; 2273 } 2274 } 2275 /* Update the headers for the current IS */ 2276 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */ 2277 if ((ctr_j = ctr[j])) { 2278 sbuf1_j = sbuf1[j]; 2279 k = ++sbuf1_j[0]; 2280 sbuf1_j[2 * k] = ctr_j; 2281 sbuf1_j[2 * k - 1] = i; 2282 } 2283 } 2284 } 2285 2286 /* Now post the sends */ 2287 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2288 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2289 PetscMPIInt j = pa[i]; 2290 PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2291 } 
2292 2293 /* Post Receives to capture the buffer size */ 2294 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2295 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2296 if (nrqs) rbuf2[0] = tmp + msz; 2297 for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2298 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2299 PetscMPIInt j = pa[i]; 2300 PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2301 } 2302 2303 /* Send to other procs the buf size they should allocate */ 2304 /* Receive messages*/ 2305 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2306 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2307 { 2308 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2309 PetscInt *sbuf2_i; 2310 2311 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2312 for (PetscMPIInt i = 0; i < nrqr; ++i) { 2313 req_size[i] = 0; 2314 rbuf1_i = rbuf1[i]; 2315 start = 2 * rbuf1_i[0] + 1; 2316 end = olengths1[i]; 2317 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2318 sbuf2_i = sbuf2[i]; 2319 for (PetscInt j = start; j < end; j++) { 2320 id = rbuf1_i[j] - rstart; 2321 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2322 sbuf2_i[j] = ncols; 2323 req_size[i] += ncols; 2324 } 2325 req_source1[i] = onodes1[i]; 2326 /* form the header */ 2327 sbuf2_i[0] = req_size[i]; 2328 for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2329 2330 PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2331 } 2332 } 2333 2334 PetscCall(PetscFree(onodes1)); 2335 PetscCall(PetscFree(olengths1)); 2336 PetscCall(PetscFree(r_waits1)); 2337 PetscCall(PetscFree4(w1, w2, w3, w4)); 2338 2339 /* Receive messages*/ 2340 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2341 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2342 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2343 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2344 req_source2[i] = 
pa[i]; 2345 PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2346 } 2347 PetscCall(PetscFree(r_waits2)); 2348 2349 /* Wait on sends1 and sends2 */ 2350 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2351 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2352 PetscCall(PetscFree(s_waits1)); 2353 PetscCall(PetscFree(s_waits2)); 2354 2355 /* Now allocate sending buffers for a->j, and send them off */ 2356 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 2357 jcnt = 0; 2358 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 2359 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0])); 2360 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2361 2362 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2363 { 2364 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2365 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2366 PetscInt cend = C->cmap->rend; 2367 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2368 2369 for (PetscMPIInt i = 0; i < nrqr; i++) { 2370 rbuf1_i = rbuf1[i]; 2371 sbuf_aj_i = sbuf_aj[i]; 2372 ct1 = 2 * rbuf1_i[0] + 1; 2373 ct2 = 0; 2374 for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2375 kmax = rbuf1[i][2 * j]; 2376 for (PetscInt k = 0; k < kmax; k++, ct1++) { 2377 row = rbuf1_i[ct1] - rstart; 2378 nzA = a_i[row + 1] - a_i[row]; 2379 nzB = b_i[row + 1] - b_i[row]; 2380 ncols = nzA + nzB; 2381 cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]); 2382 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2383 2384 /* load the column indices for this row into cols */ 2385 cols = sbuf_aj_i + ct2; 2386 2387 lwrite = 0; 2388 for (PetscInt l = 0; l < nzB; l++) { 2389 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2390 } 2391 for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2392 for (PetscInt l = 0; l < nzB; l++) { 2393 if ((ctmp = bmap[cworkB[l]]) >= cend) 
cols[lwrite++] = ctmp; 2394 } 2395 2396 ct2 += ncols; 2397 } 2398 } 2399 PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2400 } 2401 } 2402 2403 /* create col map: global col of C -> local col of submatrices */ 2404 { 2405 const PetscInt *icol_i; 2406 #if defined(PETSC_USE_CTABLE) 2407 for (PetscInt i = 0; i < ismax; i++) { 2408 if (!allcolumns[i]) { 2409 PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i)); 2410 2411 jmax = ncol[i]; 2412 icol_i = icol[i]; 2413 cmap_i = cmap[i]; 2414 for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1)); 2415 } else cmap[i] = NULL; 2416 } 2417 #else 2418 for (PetscInt i = 0; i < ismax; i++) { 2419 if (!allcolumns[i]) { 2420 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2421 jmax = ncol[i]; 2422 icol_i = icol[i]; 2423 cmap_i = cmap[i]; 2424 for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2425 } else cmap[i] = NULL; 2426 } 2427 #endif 2428 } 2429 2430 /* Create lens which is required for MatCreate... 
*/ 2431 jcnt = 0; 2432 for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i]; 2433 PetscCall(PetscMalloc1(ismax, &lens)); 2434 2435 if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0])); 2436 for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]); 2437 2438 /* Update lens from local data */ 2439 for (PetscInt i = 0; i < ismax; i++) { 2440 row2proc_i = row2proc[i]; 2441 jmax = nrow[i]; 2442 if (!allcolumns[i]) cmap_i = cmap[i]; 2443 irow_i = irow[i]; 2444 lens_i = lens[i]; 2445 for (PetscInt j = 0; j < jmax; j++) { 2446 row = irow_i[j]; 2447 proc = row2proc_i[j]; 2448 if (proc == rank) { 2449 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2450 if (!allcolumns[i]) { 2451 for (PetscInt k = 0; k < ncols; k++) { 2452 #if defined(PETSC_USE_CTABLE) 2453 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2454 #else 2455 tcol = cmap_i[cols[k]]; 2456 #endif 2457 if (tcol) lens_i[j]++; 2458 } 2459 } else { /* allcolumns */ 2460 lens_i[j] = ncols; 2461 } 2462 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2463 } 2464 } 2465 } 2466 2467 /* Create row map: global row of C -> local row of submatrices */ 2468 #if defined(PETSC_USE_CTABLE) 2469 for (PetscInt i = 0; i < ismax; i++) { 2470 PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i)); 2471 irow_i = irow[i]; 2472 jmax = nrow[i]; 2473 for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1)); 2474 } 2475 #else 2476 for (PetscInt i = 0; i < ismax; i++) { 2477 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2478 rmap_i = rmap[i]; 2479 irow_i = irow[i]; 2480 jmax = nrow[i]; 2481 for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2482 } 2483 #endif 2484 2485 /* Update lens from offproc data */ 2486 { 2487 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2488 2489 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2490 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2491 sbuf1_i = sbuf1[pa[tmp2]]; 2492 jmax = 
sbuf1_i[0]; 2493 ct1 = 2 * jmax + 1; 2494 ct2 = 0; 2495 rbuf2_i = rbuf2[tmp2]; 2496 rbuf3_i = rbuf3[tmp2]; 2497 for (PetscInt j = 1; j <= jmax; j++) { 2498 is_no = sbuf1_i[2 * j - 1]; 2499 max1 = sbuf1_i[2 * j]; 2500 lens_i = lens[is_no]; 2501 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2502 rmap_i = rmap[is_no]; 2503 for (PetscInt k = 0; k < max1; k++, ct1++) { 2504 #if defined(PETSC_USE_CTABLE) 2505 PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row)); 2506 row--; 2507 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2508 #else 2509 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2510 #endif 2511 max2 = rbuf2_i[ct1]; 2512 for (PetscInt l = 0; l < max2; l++, ct2++) { 2513 if (!allcolumns[is_no]) { 2514 #if defined(PETSC_USE_CTABLE) 2515 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2516 #else 2517 tcol = cmap_i[rbuf3_i[ct2]]; 2518 #endif 2519 if (tcol) lens_i[row]++; 2520 } else { /* allcolumns */ 2521 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2522 } 2523 } 2524 } 2525 } 2526 } 2527 } 2528 PetscCall(PetscFree(r_waits3)); 2529 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2530 PetscCall(PetscFree(s_waits3)); 2531 2532 /* Create the submatrices */ 2533 for (PetscInt i = 0; i < ismax; i++) { 2534 PetscInt rbs, cbs; 2535 2536 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2537 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2538 2539 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2540 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2541 2542 if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2543 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2544 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2545 2546 /* create struct Mat_SubSppt and attached it to submat */ 2547 PetscCall(PetscNew(&smat_i)); 2548 subc = (Mat_SeqAIJ *)submats[i]->data; 2549 subc->submatis1 = smat_i; 2550 2551 smat_i->destroy = submats[i]->ops->destroy; 2552 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2553 submats[i]->factortype = C->factortype; 2554 2555 smat_i->id = i; 2556 smat_i->nrqs = nrqs; 2557 smat_i->nrqr = nrqr; 2558 smat_i->rbuf1 = rbuf1; 2559 smat_i->rbuf2 = rbuf2; 2560 smat_i->rbuf3 = rbuf3; 2561 smat_i->sbuf2 = sbuf2; 2562 smat_i->req_source2 = req_source2; 2563 2564 smat_i->sbuf1 = sbuf1; 2565 smat_i->ptr = ptr; 2566 smat_i->tmp = tmp; 2567 smat_i->ctr = ctr; 2568 2569 smat_i->pa = pa; 2570 smat_i->req_size = req_size; 2571 smat_i->req_source1 = req_source1; 2572 2573 smat_i->allcolumns = allcolumns[i]; 2574 smat_i->singleis = PETSC_FALSE; 2575 smat_i->row2proc = row2proc[i]; 2576 smat_i->rmap = rmap[i]; 2577 smat_i->cmap = cmap[i]; 2578 } 2579 2580 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2581 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2582 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2583 PetscCall(MatSetType(submats[0], MATDUMMY)); 2584 
2585 /* create struct Mat_SubSppt and attached it to submat */ 2586 PetscCall(PetscNew(&smat_i)); 2587 submats[0]->data = (void *)smat_i; 2588 2589 smat_i->destroy = submats[0]->ops->destroy; 2590 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2591 submats[0]->factortype = C->factortype; 2592 2593 smat_i->id = 0; 2594 smat_i->nrqs = nrqs; 2595 smat_i->nrqr = nrqr; 2596 smat_i->rbuf1 = rbuf1; 2597 smat_i->rbuf2 = rbuf2; 2598 smat_i->rbuf3 = rbuf3; 2599 smat_i->sbuf2 = sbuf2; 2600 smat_i->req_source2 = req_source2; 2601 2602 smat_i->sbuf1 = sbuf1; 2603 smat_i->ptr = ptr; 2604 smat_i->tmp = tmp; 2605 smat_i->ctr = ctr; 2606 2607 smat_i->pa = pa; 2608 smat_i->req_size = req_size; 2609 smat_i->req_source1 = req_source1; 2610 2611 smat_i->allcolumns = PETSC_FALSE; 2612 smat_i->singleis = PETSC_FALSE; 2613 smat_i->row2proc = NULL; 2614 smat_i->rmap = NULL; 2615 smat_i->cmap = NULL; 2616 } 2617 2618 if (ismax) PetscCall(PetscFree(lens[0])); 2619 PetscCall(PetscFree(lens)); 2620 if (sbuf_aj) { 2621 PetscCall(PetscFree(sbuf_aj[0])); 2622 PetscCall(PetscFree(sbuf_aj)); 2623 } 2624 2625 } /* endof scall == MAT_INITIAL_MATRIX */ 2626 2627 /* Post recv matrix values */ 2628 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2629 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2630 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2631 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2632 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2633 PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2634 } 2635 2636 /* Allocate sending buffers for a->a, and send them off */ 2637 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2638 jcnt = 0; 2639 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 2640 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0])); 2641 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2642 2643 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2644 { 2645 PetscInt nzA, nzB, *a_i = a->i, *b_i = 
b->i, *cworkB, lwrite; 2646 PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2647 PetscInt cend = C->cmap->rend; 2648 PetscInt *b_j = b->j; 2649 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2650 2651 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2652 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2653 for (PetscMPIInt i = 0; i < nrqr; i++) { 2654 rbuf1_i = rbuf1[i]; 2655 sbuf_aa_i = sbuf_aa[i]; 2656 ct1 = 2 * rbuf1_i[0] + 1; 2657 ct2 = 0; 2658 for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2659 kmax = rbuf1_i[2 * j]; 2660 for (PetscInt k = 0; k < kmax; k++, ct1++) { 2661 row = rbuf1_i[ct1] - rstart; 2662 nzA = a_i[row + 1] - a_i[row]; 2663 nzB = b_i[row + 1] - b_i[row]; 2664 ncols = nzA + nzB; 2665 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2666 vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]); 2667 vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]); 2668 2669 /* load the column values for this row into vals*/ 2670 vals = sbuf_aa_i + ct2; 2671 2672 lwrite = 0; 2673 for (PetscInt l = 0; l < nzB; l++) { 2674 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2675 } 2676 for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2677 for (PetscInt l = 0; l < nzB; l++) { 2678 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2679 } 2680 2681 ct2 += ncols; 2682 } 2683 } 2684 PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2685 } 2686 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2687 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2688 } 2689 2690 /* Assemble the matrices */ 2691 /* First assemble the local rows */ 2692 for (PetscInt i = 0; i < ismax; i++) { 2693 row2proc_i = row2proc[i]; 2694 subc = (Mat_SeqAIJ *)submats[i]->data; 2695 imat_ilen = subc->ilen; 2696 imat_j = subc->j; 2697 imat_i = subc->i; 2698 imat_a = subc->a; 2699 2700 if (!allcolumns[i]) 
cmap_i = cmap[i]; 2701 rmap_i = rmap[i]; 2702 irow_i = irow[i]; 2703 jmax = nrow[i]; 2704 for (PetscInt j = 0; j < jmax; j++) { 2705 row = irow_i[j]; 2706 proc = row2proc_i[j]; 2707 if (proc == rank) { 2708 old_row = row; 2709 #if defined(PETSC_USE_CTABLE) 2710 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2711 row--; 2712 #else 2713 row = rmap_i[row]; 2714 #endif 2715 ilen_row = imat_ilen[row]; 2716 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2717 mat_i = imat_i[row]; 2718 mat_a = imat_a + mat_i; 2719 mat_j = imat_j + mat_i; 2720 if (!allcolumns[i]) { 2721 for (PetscInt k = 0; k < ncols; k++) { 2722 #if defined(PETSC_USE_CTABLE) 2723 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2724 #else 2725 tcol = cmap_i[cols[k]]; 2726 #endif 2727 if (tcol) { 2728 *mat_j++ = tcol - 1; 2729 *mat_a++ = vals[k]; 2730 ilen_row++; 2731 } 2732 } 2733 } else { /* allcolumns */ 2734 for (PetscInt k = 0; k < ncols; k++) { 2735 *mat_j++ = cols[k]; /* global col index! 
*/ 2736 *mat_a++ = vals[k]; 2737 ilen_row++; 2738 } 2739 } 2740 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2741 2742 imat_ilen[row] = ilen_row; 2743 } 2744 } 2745 } 2746 2747 /* Now assemble the off proc rows */ 2748 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2749 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2750 sbuf1_i = sbuf1[pa[tmp2]]; 2751 jmax = sbuf1_i[0]; 2752 ct1 = 2 * jmax + 1; 2753 ct2 = 0; 2754 rbuf2_i = rbuf2[tmp2]; 2755 rbuf3_i = rbuf3[tmp2]; 2756 rbuf4_i = rbuf4[tmp2]; 2757 for (PetscInt j = 1; j <= jmax; j++) { 2758 is_no = sbuf1_i[2 * j - 1]; 2759 rmap_i = rmap[is_no]; 2760 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2761 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2762 imat_ilen = subc->ilen; 2763 imat_j = subc->j; 2764 imat_i = subc->i; 2765 imat_a = subc->a; 2766 max1 = sbuf1_i[2 * j]; 2767 for (PetscInt k = 0; k < max1; k++, ct1++) { 2768 row = sbuf1_i[ct1]; 2769 #if defined(PETSC_USE_CTABLE) 2770 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2771 row--; 2772 #else 2773 row = rmap_i[row]; 2774 #endif 2775 ilen = imat_ilen[row]; 2776 mat_i = imat_i[row]; 2777 mat_a = PetscSafePointerPlusOffset(imat_a, mat_i); 2778 mat_j = PetscSafePointerPlusOffset(imat_j, mat_i); 2779 max2 = rbuf2_i[ct1]; 2780 if (!allcolumns[is_no]) { 2781 for (PetscInt l = 0; l < max2; l++, ct2++) { 2782 #if defined(PETSC_USE_CTABLE) 2783 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2784 #else 2785 tcol = cmap_i[rbuf3_i[ct2]]; 2786 #endif 2787 if (tcol) { 2788 *mat_j++ = tcol - 1; 2789 *mat_a++ = rbuf4_i[ct2]; 2790 ilen++; 2791 } 2792 } 2793 } else { /* allcolumns */ 2794 for (PetscInt l = 0; l < max2; l++, ct2++) { 2795 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2796 *mat_a++ = rbuf4_i[ct2]; 2797 ilen++; 2798 } 2799 } 2800 imat_ilen[row] = ilen; 2801 } 2802 } 2803 } 2804 2805 if (!iscsorted) { /* sort column indices of the rows */ 2806 for (PetscInt i = 0; i < ismax; i++) { 
2807 subc = (Mat_SeqAIJ *)submats[i]->data; 2808 imat_j = subc->j; 2809 imat_i = subc->i; 2810 imat_a = subc->a; 2811 imat_ilen = subc->ilen; 2812 2813 if (allcolumns[i]) continue; 2814 jmax = nrow[i]; 2815 for (PetscInt j = 0; j < jmax; j++) { 2816 mat_i = imat_i[j]; 2817 mat_a = imat_a + mat_i; 2818 mat_j = imat_j + mat_i; 2819 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2820 } 2821 } 2822 } 2823 2824 PetscCall(PetscFree(r_waits4)); 2825 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2826 PetscCall(PetscFree(s_waits4)); 2827 2828 /* Restore the indices */ 2829 for (PetscInt i = 0; i < ismax; i++) { 2830 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2831 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2832 } 2833 2834 for (PetscInt i = 0; i < ismax; i++) { 2835 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2836 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2837 } 2838 2839 /* Destroy allocated memory */ 2840 if (sbuf_aa) { 2841 PetscCall(PetscFree(sbuf_aa[0])); 2842 PetscCall(PetscFree(sbuf_aa)); 2843 } 2844 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2845 2846 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2847 PetscCall(PetscFree(rbuf4)); 2848 2849 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2850 PetscFunctionReturn(PETSC_SUCCESS); 2851 } 2852 2853 /* 2854 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2855 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2856 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2857 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2858 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2859 state, and needs to be "assembled" later by compressing B's column space. 
2860 2861 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2862 Following this call, C->A & C->B have been created, even if empty. 2863 */ 2864 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2865 { 2866 /* If making this function public, change the error returned in this function away from _PLIB. */ 2867 Mat_MPIAIJ *aij; 2868 Mat_SeqAIJ *Baij; 2869 PetscBool seqaij, Bdisassembled; 2870 PetscInt m, n, *nz, ngcol, col, rstart, rend, shift, count; 2871 PetscScalar v; 2872 const PetscInt *rowindices, *colindices; 2873 2874 PetscFunctionBegin; 2875 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2876 if (A) { 2877 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2878 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2879 if (rowemb) { 2880 PetscCall(ISGetLocalSize(rowemb, &m)); 2881 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2882 } else { 2883 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2884 } 2885 if (dcolemb) { 2886 PetscCall(ISGetLocalSize(dcolemb, &n)); 2887 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2888 } else { 2889 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2890 } 2891 } 2892 if (B) { 2893 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2894 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2895 if (rowemb) { 2896 
PetscCall(ISGetLocalSize(rowemb, &m)); 2897 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2898 } else { 2899 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2900 } 2901 if (ocolemb) { 2902 PetscCall(ISGetLocalSize(ocolemb, &n)); 2903 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2904 } else { 2905 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2906 } 2907 } 2908 2909 aij = (Mat_MPIAIJ *)C->data; 2910 if (!aij->A) { 2911 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2912 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2913 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2914 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2915 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2916 } 2917 if (A) { 2918 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2919 } else { 2920 PetscCall(MatSetUp(aij->A)); 2921 } 2922 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2923 /* 2924 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2925 need to "disassemble" B -- convert it to using C's global indices. 2926 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2927 2928 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2929 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2930 2931 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 
2932 At least avoid calling MatSetValues() and the implied searches? 2933 */ 2934 2935 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2936 #if defined(PETSC_USE_CTABLE) 2937 PetscCall(PetscHMapIDestroy(&aij->colmap)); 2938 #else 2939 PetscCall(PetscFree(aij->colmap)); 2940 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2941 #endif 2942 ngcol = 0; 2943 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2944 if (aij->garray) PetscCall(PetscFree(aij->garray)); 2945 PetscCall(VecDestroy(&aij->lvec)); 2946 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2947 } 2948 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2949 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2950 } 2951 Bdisassembled = PETSC_FALSE; 2952 if (!aij->B) { 2953 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2954 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2955 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2956 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2957 Bdisassembled = PETSC_TRUE; 2958 } 2959 if (B) { 2960 Baij = (Mat_SeqAIJ *)B->data; 2961 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2962 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2963 for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2964 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2965 PetscCall(PetscFree(nz)); 2966 } 2967 2968 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2969 shift = rend - rstart; 2970 count = 0; 2971 rowindices = NULL; 2972 colindices = NULL; 2973 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2974 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2975 for (PetscInt i = 0; i < B->rmap->n; i++) { 2976 PetscInt row; 2977 row = i; 2978 if (rowindices) row = rowindices[i]; 2979 for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2980 col = Baij->j[count]; 2981 if (colindices) col = 
colindices[col]; 2982 if (Bdisassembled && col >= rstart) col += shift; 2983 v = Baij->a[count]; 2984 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2985 ++count; 2986 } 2987 } 2988 /* No assembly for aij->B is necessary. */ 2989 /* FIXME: set aij->B's nonzerostate correctly. */ 2990 } else { 2991 PetscCall(MatSetUp(aij->B)); 2992 } 2993 C->preallocated = PETSC_TRUE; 2994 C->was_assembled = PETSC_FALSE; 2995 C->assembled = PETSC_FALSE; 2996 /* 2997 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2998 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2999 */ 3000 PetscFunctionReturn(PETSC_SUCCESS); 3001 } 3002 3003 /* 3004 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3005 */ 3006 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 3007 { 3008 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 3009 3010 PetscFunctionBegin; 3011 PetscAssertPointer(A, 2); 3012 PetscAssertPointer(B, 3); 3013 /* FIXME: make sure C is assembled */ 3014 *A = aij->A; 3015 *B = aij->B; 3016 /* Note that we don't incref *A and *B, so be careful! */ 3017 PetscFunctionReturn(PETSC_SUCCESS); 3018 } 3019 3020 /* 3021 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3022 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
 */
/* Generic driver: the function pointers abstract over the AIJ flavor (see MatCreateSubMatricesMPI_MPIAIJ below).
   Collective on C's communicator; isrow[]/iscol[] may live on subcommunicators of C. */
static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    cismax, ispar;
  Mat        *A, *B;
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

  /* Count the parallel (comm size > 1) index sets; each needs an off-diag "complement" IS below. */
  cismax = 0;
  for (PetscInt i = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* if (ispar) */
  /*
     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) { /* ii indexes only the parallel ISs */
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    for (PetscInt j = 1; j < issize; ++j) { /* sorted, so duplicates must be adjacent */
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (PetscInt j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      cisrow[ii] = isrow[i]; /* alias, not increfed -- do not destroy through cisrow */
      ++ii;
    }
  }
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices, as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
       values into A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        PetscCall(MatDestroy(&AA)); /* BB (== B[ii]) is freed later via MatDestroySubMatrices() */
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller's submat array */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A)); /* only the array: the entries were destroyed or handed off above */
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MPIAIJ instantiation of the generic driver above: plugs in the SeqAIJ/MPIAIJ-specific helpers. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}