1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 /* 22 Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks 23 24 The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but 25 that is a very major recoding job. 
26 27 Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin 28 */ 29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[]) 30 { 31 PetscFunctionBegin; 32 for (PetscInt i = 0; i < imax; i++) { 33 if (!is[i]) break; 34 PetscInt n = 0, N, Nmax, Nmin; 35 const PetscInt *idx; 36 PetscInt *nidx = NULL; 37 MPI_Comm comm; 38 PetscBT bt; 39 40 PetscCall(ISGetLocalSize(is[i], &N)); 41 if (N > 0) { /* Nmax and Nmin are garbage for empty IS */ 42 PetscCall(ISGetIndices(is[i], &idx)); 43 PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax)); 44 Nmin = Nmin / bs; 45 Nmax = Nmax / bs; 46 PetscCall(PetscBTCreate(Nmax - Nmin, &bt)); 47 for (PetscInt j = 0; j < N; j++) { 48 if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++; 49 } 50 PetscCall(PetscMalloc1(n, &nidx)); 51 n = 0; 52 PetscCall(PetscBTMemzero(Nmax - Nmin, bt)); 53 for (PetscInt j = 0; j < N; j++) { 54 if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs; 55 } 56 PetscCall(PetscBTDestroy(&bt)); 57 PetscCall(ISRestoreIndices(is[i], &idx)); 58 } 59 PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm)); 60 PetscCall(ISDestroy(is + i)); 61 PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i)); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov) 67 { 68 PetscInt i; 69 70 PetscFunctionBegin; 71 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 72 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is)); 73 if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is)); 74 PetscFunctionReturn(PETSC_SUCCESS); 75 } 76 77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov) 78 { 79 PetscInt i; 80 81 PetscFunctionBegin; 82 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), 
PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 83 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is)); 84 if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[]) 89 { 90 MPI_Comm comm; 91 PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j; 92 PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata; 93 PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL; 94 const PetscInt *indices_i, **indices; 95 PetscLayout rmap; 96 PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks; 97 PetscSF sf; 98 PetscSFNode *remote; 99 100 PetscFunctionBegin; 101 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 102 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 103 PetscCallMPI(MPI_Comm_size(comm, &size)); 104 /* get row map to determine where rows should be going */ 105 PetscCall(MatGetLayouts(mat, &rmap, NULL)); 106 /* retrieve IS data and put all together so that we 107 * can optimize communication 108 * */ 109 PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length)); 110 tlength = 0; 111 for (PetscInt i = 0; i < nidx; i++) { 112 PetscCall(ISGetLocalSize(is[i], &length[i])); 113 tlength += length[i]; 114 PetscCall(ISGetIndices(is[i], &indices[i])); 115 } 116 /* find these rows on remote processors */ 117 PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids)); 118 PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp)); 119 nrrows = 0; 120 for (PetscInt i = 0; i < nidx; i++) { 121 length_i = length[i]; 122 indices_i = indices[i]; 123 for (PetscInt j = 0; j < length_i; j++) { 124 owner = -1; 125 PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner)); 126 /* remote processors */ 127 if (owner 
!= rank) { 128 tosizes_temp[owner]++; /* number of rows to owner */ 129 rrow_ranks[nrrows] = owner; /* processor */ 130 rrow_isids[nrrows] = i; /* is id */ 131 remoterows[nrrows++] = indices_i[j]; /* row */ 132 } 133 } 134 PetscCall(ISRestoreIndices(is[i], &indices[i])); 135 } 136 PetscCall(PetscFree2(*(PetscInt ***)&indices, length)); 137 /* test if we need to exchange messages 138 * generally speaking, we do not need to exchange 139 * data when overlap is 1 140 * */ 141 PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm)); 142 /* we do not have any messages 143 * It usually corresponds to overlap 1 144 * */ 145 if (!reducednrrows) { 146 PetscCall(PetscFree3(toranks, tosizes, tosizes_temp)); 147 PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids)); 148 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is)); 149 PetscFunctionReturn(PETSC_SUCCESS); 150 } 151 nto = 0; 152 /* send sizes and ranks for building a two-sided communication */ 153 for (PetscMPIInt i = 0; i < size; i++) { 154 if (tosizes_temp[i]) { 155 tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */ 156 tosizes_temp[i] = nto; /* a map from processor to index */ 157 toranks[nto++] = i; /* MPI process */ 158 } 159 } 160 PetscCall(PetscMalloc1(nto + 1, &toffsets)); 161 toffsets[0] = 0; 162 for (PetscInt i = 0; i < nto; i++) { 163 toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */ 164 tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */ 165 } 166 /* send information to other processors */ 167 PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes)); 168 nrecvrows = 0; 169 for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i]; 170 PetscCall(PetscMalloc1(nrecvrows, &remote)); 171 nrecvrows = 0; 172 for (PetscMPIInt i = 0; i < nfrom; i++) { 173 for (PetscInt j = 0; j < fromsizes[2 * i]; j++) { 174 remote[nrecvrows].rank = fromranks[i]; 175 remote[nrecvrows++].index = fromsizes[2 * i + 1] + 
j; 176 } 177 } 178 PetscCall(PetscSFCreate(comm, &sf)); 179 PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER)); 180 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 181 PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); 182 PetscCall(PetscSFSetFromOptions(sf)); 183 /* message pair <no of is, row> */ 184 PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata)); 185 for (PetscInt i = 0; i < nrrows; i++) { 186 owner = rrow_ranks[i]; /* process */ 187 j = tosizes_temp[owner]; /* index */ 188 todata[toffsets[j]++] = rrow_isids[i]; 189 todata[toffsets[j]++] = remoterows[i]; 190 } 191 PetscCall(PetscFree3(toranks, tosizes, tosizes_temp)); 192 PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids)); 193 PetscCall(PetscFree(toffsets)); 194 PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE)); 195 PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE)); 196 PetscCall(PetscSFDestroy(&sf)); 197 /* send rows belonging to the remote so that then we could get the overlapping data back */ 198 PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata)); 199 PetscCall(PetscFree2(todata, fromdata)); 200 PetscCall(PetscFree(fromsizes)); 201 PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes)); 202 PetscCall(PetscFree(fromranks)); 203 nrecvrows = 0; 204 for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i]; 205 PetscCall(PetscCalloc1(nrecvrows, &todata)); 206 PetscCall(PetscMalloc1(nrecvrows, &remote)); 207 nrecvrows = 0; 208 for (PetscInt i = 0; i < nto; i++) { 209 for (PetscInt j = 0; j < tosizes[2 * i]; j++) { 210 remote[nrecvrows].rank = toranks[i]; 211 remote[nrecvrows++].index = tosizes[2 * i + 1] + j; 212 } 213 } 214 PetscCall(PetscSFCreate(comm, &sf)); 215 PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, 
PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER)); 216 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 217 PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); 218 PetscCall(PetscSFSetFromOptions(sf)); 219 /* overlap communication and computation */ 220 PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE)); 221 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is)); 222 PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE)); 223 PetscCall(PetscSFDestroy(&sf)); 224 PetscCall(PetscFree2(sbdata, sbsizes)); 225 PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata)); 226 PetscCall(PetscFree(toranks)); 227 PetscCall(PetscFree(tosizes)); 228 PetscCall(PetscFree(todata)); 229 PetscFunctionReturn(PETSC_SUCCESS); 230 } 231 232 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 233 { 234 PetscInt *isz, isz_i, i, j, is_id, data_size; 235 PetscInt col, lsize, max_lsize, *indices_temp, *indices_i; 236 const PetscInt *indices_i_temp; 237 MPI_Comm *iscomms; 238 239 PetscFunctionBegin; 240 max_lsize = 0; 241 PetscCall(PetscMalloc1(nidx, &isz)); 242 for (i = 0; i < nidx; i++) { 243 PetscCall(ISGetLocalSize(is[i], &lsize)); 244 max_lsize = lsize > max_lsize ? 
lsize : max_lsize; 245 isz[i] = lsize; 246 } 247 PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms)); 248 for (i = 0; i < nidx; i++) { 249 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL)); 250 PetscCall(ISGetIndices(is[i], &indices_i_temp)); 251 PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i])); 252 PetscCall(ISRestoreIndices(is[i], &indices_i_temp)); 253 PetscCall(ISDestroy(&is[i])); 254 } 255 /* retrieve information to get row id and its overlap */ 256 for (i = 0; i < nrecvs;) { 257 is_id = recvdata[i++]; 258 data_size = recvdata[i++]; 259 indices_i = indices_temp + (max_lsize + nrecvs) * is_id; 260 isz_i = isz[is_id]; 261 for (j = 0; j < data_size; j++) { 262 col = recvdata[i++]; 263 indices_i[isz_i++] = col; 264 } 265 isz[is_id] = isz_i; 266 } 267 /* remove duplicate entities */ 268 for (i = 0; i < nidx; i++) { 269 indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i); 270 isz_i = isz[i]; 271 PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i)); 272 PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i])); 273 PetscCall(PetscCommDestroy(&iscomms[i])); 274 } 275 PetscCall(PetscFree(isz)); 276 PetscCall(PetscFree2(indices_temp, iscomms)); 277 PetscFunctionReturn(PETSC_SUCCESS); 278 } 279 280 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 281 { 282 PetscLayout rmap, cmap; 283 PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i; 284 PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata; 285 PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets; 286 const PetscInt *gcols, *ai, *aj, *bi, *bj; 287 Mat amat, bmat; 288 PetscMPIInt rank; 289 
PetscBool done; 290 MPI_Comm comm; 291 292 PetscFunctionBegin; 293 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 294 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 295 PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols)); 296 /* Even if the mat is symmetric, we still assume it is not symmetric */ 297 PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 298 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 299 PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 300 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 301 /* total number of nonzero values is used to estimate the memory usage in the next step */ 302 tnz = ai[an] + bi[bn]; 303 PetscCall(MatGetLayouts(mat, &rmap, &cmap)); 304 PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL)); 305 PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL)); 306 /* to find the longest message */ 307 max_fszs = 0; 308 for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs; 309 /* better way to estimate number of nonzero in the mat??? 
*/ 310 PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp)); 311 for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i); 312 rows_pos = 0; 313 totalrows = 0; 314 for (i = 0; i < nfrom; i++) { 315 PetscCall(PetscArrayzero(rows_pos_i, nidx)); 316 /* group data together */ 317 for (j = 0; j < fromsizes[2 * i]; j += 2) { 318 is_id = fromrows[rows_pos++]; /* no of is */ 319 rows_i = rows_data[is_id]; 320 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */ 321 } 322 /* estimate a space to avoid multiple allocations */ 323 for (j = 0; j < nidx; j++) { 324 indvc_ij = 0; 325 rows_i = rows_data[j]; 326 for (l = 0; l < rows_pos_i[j]; l++) { 327 row = rows_i[l] - rstart; 328 start = ai[row]; 329 end = ai[row + 1]; 330 for (k = start; k < end; k++) { /* Amat */ 331 col = aj[k] + cstart; 332 indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */ 333 } 334 start = bi[row]; 335 end = bi[row + 1]; 336 for (k = start; k < end; k++) { /* Bmat */ 337 col = gcols[bj[k]]; 338 indices_tmp[indvc_ij++] = col; 339 } 340 } 341 PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp)); 342 indv_counts[i * nidx + j] = indvc_ij; 343 totalrows += indvc_ij; 344 } 345 } 346 /* message triple <no of is, number of rows, rows> */ 347 PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes)); 348 totalrows = 0; 349 rows_pos = 0; 350 /* use this code again */ 351 for (i = 0; i < nfrom; i++) { 352 PetscCall(PetscArrayzero(rows_pos_i, nidx)); 353 for (j = 0; j < fromsizes[2 * i]; j += 2) { 354 is_id = fromrows[rows_pos++]; 355 rows_i = rows_data[is_id]; 356 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 357 } 358 /* add data */ 359 for (j = 0; j < nidx; j++) { 360 if (!indv_counts[i * nidx + j]) continue; 361 indvc_ij = 0; 362 sbdata[totalrows++] = j; 363 sbdata[totalrows++] = indv_counts[i * nidx + j]; 364 
sbsizes[2 * i] += 2; 365 rows_i = rows_data[j]; 366 for (l = 0; l < rows_pos_i[j]; l++) { 367 row = rows_i[l] - rstart; 368 start = ai[row]; 369 end = ai[row + 1]; 370 for (k = start; k < end; k++) { /* Amat */ 371 col = aj[k] + cstart; 372 indices_tmp[indvc_ij++] = col; 373 } 374 start = bi[row]; 375 end = bi[row + 1]; 376 for (k = start; k < end; k++) { /* Bmat */ 377 col = gcols[bj[k]]; 378 indices_tmp[indvc_ij++] = col; 379 } 380 } 381 PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp)); 382 sbsizes[2 * i] += indvc_ij; 383 PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij)); 384 totalrows += indvc_ij; 385 } 386 } 387 PetscCall(PetscMalloc1(nfrom + 1, &offsets)); 388 offsets[0] = 0; 389 for (i = 0; i < nfrom; i++) { 390 offsets[i + 1] = offsets[i] + sbsizes[2 * i]; 391 sbsizes[2 * i + 1] = offsets[i]; 392 } 393 PetscCall(PetscFree(offsets)); 394 if (sbrowsizes) *sbrowsizes = sbsizes; 395 if (sbrows) *sbrows = sbdata; 396 PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp)); 397 PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 398 PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 399 PetscFunctionReturn(PETSC_SUCCESS); 400 } 401 402 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[]) 403 { 404 const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices; 405 PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp; 406 PetscInt lsize, lsize_tmp; 407 PetscMPIInt rank, owner; 408 Mat amat, bmat; 409 PetscBool done; 410 PetscLayout cmap, rmap; 411 MPI_Comm comm; 412 413 PetscFunctionBegin; 414 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 415 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 416 PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols)); 417 PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 418 PetscCheck(done, PETSC_COMM_SELF, 
PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 419 PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 420 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 421 /* is it a safe way to compute number of nonzero values ? */ 422 tnz = ai[an] + bi[bn]; 423 PetscCall(MatGetLayouts(mat, &rmap, &cmap)); 424 PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL)); 425 PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL)); 426 /* it is a better way to estimate memory than the old implementation 427 * where global size of matrix is used 428 * */ 429 PetscCall(PetscMalloc1(tnz, &indices_temp)); 430 for (i = 0; i < nidx; i++) { 431 MPI_Comm iscomm; 432 433 PetscCall(ISGetLocalSize(is[i], &lsize)); 434 PetscCall(ISGetIndices(is[i], &indices)); 435 lsize_tmp = 0; 436 for (j = 0; j < lsize; j++) { 437 owner = -1; 438 row = indices[j]; 439 PetscCall(PetscLayoutFindOwner(rmap, row, &owner)); 440 if (owner != rank) continue; 441 /* local number */ 442 row -= rstart; 443 start = ai[row]; 444 end = ai[row + 1]; 445 for (k = start; k < end; k++) { /* Amat */ 446 col = aj[k] + cstart; 447 indices_temp[lsize_tmp++] = col; 448 } 449 start = bi[row]; 450 end = bi[row + 1]; 451 for (k = start; k < end; k++) { /* Bmat */ 452 col = gcols[bj[k]]; 453 indices_temp[lsize_tmp++] = col; 454 } 455 } 456 PetscCall(ISRestoreIndices(is[i], &indices)); 457 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL)); 458 PetscCall(ISDestroy(&is[i])); 459 PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp)); 460 PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i])); 461 PetscCall(PetscCommDestroy(&iscomm)); 462 } 463 PetscCall(PetscFree(indices_temp)); 464 PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 465 PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 466 PetscFunctionReturn(PETSC_SUCCESS); 467 } 468 469 /* 
470 Sample message format: 471 If a processor A wants processor B to process some elements corresponding 472 to index sets is[1],is[5] 473 mesg [0] = 2 (no of index sets in the mesg) 474 ----------- 475 mesg [1] = 1 => is[1] 476 mesg [2] = sizeof(is[1]); 477 ----------- 478 mesg [3] = 5 => is[5] 479 mesg [4] = sizeof(is[5]); 480 ----------- 481 mesg [5] 482 mesg [n] datas[1] 483 ----------- 484 mesg[n+1] 485 mesg[m] data(is[5]) 486 ----------- 487 488 Notes: 489 nrqs - no of requests sent (or to be sent out) 490 nrqr - no of requests received (which have to be or which have been processed) 491 */ 492 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[]) 493 { 494 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 495 PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2; 496 const PetscInt **idx, *idx_i; 497 PetscInt *n, **data, len; 498 #if defined(PETSC_USE_CTABLE) 499 PetscHMapI *table_data, table_data_i; 500 PetscInt *tdata, tcount, tcount_max; 501 #else 502 PetscInt *data_i, *d_p; 503 #endif 504 PetscMPIInt size, rank, tag1, tag2, proc = 0, nrqs, *pa; 505 PetscInt M, k, **rbuf, row, msz, **outdat, **ptr, *isz1; 506 PetscInt *ctr, *tmp, *isz, **xdata, **rbuf2; 507 PetscBT *table; 508 MPI_Comm comm; 509 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2; 510 MPI_Status *recv_status; 511 MPI_Comm *iscomms; 512 char *t_p; 513 514 PetscFunctionBegin; 515 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 516 size = c->size; 517 rank = c->rank; 518 M = C->rmap->N; 519 520 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1)); 521 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 522 523 PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n)); 524 525 for (PetscInt i = 0; i < imax; i++) { 526 PetscCall(ISGetIndices(is[i], &idx[i])); 527 PetscCall(ISGetLocalSize(is[i], &n[i])); 528 } 529 530 /* evaluate communication - mesg to who,length of mesg, and buffer space 531 required. 
Based on this, buffers are allocated, and data copied into them */ 532 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); 533 for (PetscInt i = 0; i < imax; i++) { 534 PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/ 535 idx_i = idx[i]; 536 len = n[i]; 537 for (PetscInt j = 0; j < len; j++) { 538 row = idx_i[j]; 539 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries"); 540 PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc)); 541 w4[proc]++; 542 } 543 for (PetscMPIInt j = 0; j < size; j++) { 544 if (w4[j]) { 545 w1[j] += w4[j]; 546 w3[j]++; 547 } 548 } 549 } 550 551 nrqs = 0; /* no of outgoing messages */ 552 msz = 0; /* total mesg length (for all proc */ 553 w1[rank] = 0; /* no mesg sent to intself */ 554 w3[rank] = 0; 555 for (PetscMPIInt i = 0; i < size; i++) { 556 if (w1[i]) { 557 w2[i] = 1; 558 nrqs++; 559 } /* there exists a message to proc i */ 560 } 561 /* pa - is list of processors to communicate with */ 562 PetscCall(PetscMalloc1(nrqs, &pa)); 563 for (PetscMPIInt i = 0, j = 0; i < size; i++) { 564 if (w1[i]) { 565 pa[j] = i; 566 j++; 567 } 568 } 569 570 /* Each message would have a header = 1 + 2*(no of IS) + data */ 571 for (PetscMPIInt i = 0; i < nrqs; i++) { 572 PetscMPIInt j = pa[i]; 573 w1[j] += w2[j] + 2 * w3[j]; 574 msz += w1[j]; 575 } 576 577 /* Determine the number of messages to expect, their lengths, from from-ids */ 578 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 579 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 580 581 /* Now post the Irecvs corresponding to these messages */ 582 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1)); 583 584 /* Allocate Memory for outgoing messages */ 585 PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr)); 586 PetscCall(PetscArrayzero(outdat, size)); 587 PetscCall(PetscArrayzero(ptr, size)); 588 589 { 590 PetscInt 
*iptr = tmp, ict = 0; 591 for (PetscMPIInt i = 0; i < nrqs; i++) { 592 PetscMPIInt j = pa[i]; 593 iptr += ict; 594 outdat[j] = iptr; 595 ict = w1[j]; 596 } 597 } 598 599 /* Form the outgoing messages */ 600 /* plug in the headers */ 601 for (PetscMPIInt i = 0; i < nrqs; i++) { 602 PetscMPIInt j = pa[i]; 603 outdat[j][0] = 0; 604 PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j])); 605 ptr[j] = outdat[j] + 2 * w3[j] + 1; 606 } 607 608 /* Memory for doing local proc's work */ 609 { 610 PetscInt M_BPB_imax = 0; 611 #if defined(PETSC_USE_CTABLE) 612 PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax)); 613 PetscCall(PetscMalloc1(imax, &table_data)); 614 for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i)); 615 PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p)); 616 for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i; 617 #else 618 PetscInt Mimax = 0; 619 PetscCall(PetscIntMultError(M, imax, &Mimax)); 620 PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax)); 621 PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p)); 622 for (PetscInt i = 0; i < imax; i++) { 623 table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i; 624 data[i] = d_p + M * i; 625 } 626 #endif 627 } 628 629 /* Parse the IS and update local tables and the outgoing buf with the data */ 630 { 631 PetscInt n_i, isz_i, *outdat_j, ctr_j; 632 PetscBT table_i; 633 634 for (PetscInt i = 0; i < imax; i++) { 635 PetscCall(PetscArrayzero(ctr, size)); 636 n_i = n[i]; 637 table_i = table[i]; 638 idx_i = idx[i]; 639 #if defined(PETSC_USE_CTABLE) 640 table_data_i = table_data[i]; 641 #else 642 data_i = data[i]; 643 #endif 644 isz_i = isz[i]; 645 for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */ 646 row = idx_i[j]; 647 PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc)); 648 if (proc != rank) { /* copy to 
the outgoing buffer */ 649 ctr[proc]++; 650 *ptr[proc] = row; 651 ptr[proc]++; 652 } else if (!PetscBTLookupSet(table_i, row)) { 653 #if defined(PETSC_USE_CTABLE) 654 PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1)); 655 #else 656 data_i[isz_i] = row; /* Update the local table */ 657 #endif 658 isz_i++; 659 } 660 } 661 /* Update the headers for the current IS */ 662 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */ 663 if ((ctr_j = ctr[j])) { 664 outdat_j = outdat[j]; 665 k = ++outdat_j[0]; 666 outdat_j[2 * k] = ctr_j; 667 outdat_j[2 * k - 1] = i; 668 } 669 } 670 isz[i] = isz_i; 671 } 672 } 673 674 /* Now post the sends */ 675 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 676 for (PetscMPIInt i = 0; i < nrqs; ++i) { 677 PetscMPIInt j = pa[i]; 678 PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i)); 679 } 680 681 /* No longer need the original indices */ 682 PetscCall(PetscMalloc1(imax, &iscomms)); 683 for (PetscInt i = 0; i < imax; ++i) { 684 PetscCall(ISRestoreIndices(is[i], idx + i)); 685 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL)); 686 } 687 PetscCall(PetscFree2(*(PetscInt ***)&idx, n)); 688 689 for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i])); 690 691 /* Do Local work */ 692 #if defined(PETSC_USE_CTABLE) 693 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data)); 694 #else 695 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL)); 696 #endif 697 698 /* Receive messages */ 699 PetscCall(PetscMalloc1(nrqr, &recv_status)); 700 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status)); 701 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 702 703 /* Phase 1 sends are complete - deallocate buffers */ 704 PetscCall(PetscFree4(outdat, ptr, tmp, ctr)); 705 PetscCall(PetscFree4(w1, w2, w3, w4)); 706 707 PetscCall(PetscMalloc1(nrqr, &xdata)); 708 PetscCall(PetscMalloc1(nrqr, &isz1)); 
709 PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1)); 710 PetscCall(PetscFree(rbuf[0])); 711 PetscCall(PetscFree(rbuf)); 712 713 /* Send the data back */ 714 /* Do a global reduction to know the buffer space req for incoming messages */ 715 { 716 PetscMPIInt *rw1; 717 718 PetscCall(PetscCalloc1(size, &rw1)); 719 for (PetscMPIInt i = 0; i < nrqr; ++i) { 720 proc = recv_status[i].MPI_SOURCE; 721 PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch"); 722 PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc])); 723 } 724 PetscCall(PetscFree(onodes1)); 725 PetscCall(PetscFree(olengths1)); 726 727 /* Determine the number of messages to expect, their lengths, from from-ids */ 728 PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2)); 729 PetscCall(PetscFree(rw1)); 730 } 731 /* Now post the Irecvs corresponding to these messages */ 732 PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2)); 733 734 /* Now post the sends */ 735 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 736 for (PetscMPIInt i = 0; i < nrqr; ++i) { PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i)); } 737 738 /* receive work done on other processors */ 739 { 740 PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax; 741 PetscMPIInt idex; 742 PetscBT table_i; 743 744 for (PetscMPIInt i = 0; i < nrqs; ++i) { 745 PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE)); 746 /* Process the message */ 747 rbuf2_i = rbuf2[idex]; 748 ct1 = 2 * rbuf2_i[0] + 1; 749 jmax = rbuf2[idex][0]; 750 for (PetscInt j = 1; j <= jmax; j++) { 751 max = rbuf2_i[2 * j]; 752 is_no = rbuf2_i[2 * j - 1]; 753 isz_i = isz[is_no]; 754 table_i = table[is_no]; 755 #if defined(PETSC_USE_CTABLE) 756 table_data_i = table_data[is_no]; 757 #else 758 data_i = data[is_no]; 759 #endif 760 for (k = 0; k < max; k++, ct1++) { 761 row = rbuf2_i[ct1]; 762 if (!PetscBTLookupSet(table_i, 
row)) { 763 #if defined(PETSC_USE_CTABLE) 764 PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1)); 765 #else 766 data_i[isz_i] = row; 767 #endif 768 isz_i++; 769 } 770 } 771 isz[is_no] = isz_i; 772 } 773 } 774 775 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 776 } 777 778 #if defined(PETSC_USE_CTABLE) 779 tcount_max = 0; 780 for (PetscInt i = 0; i < imax; ++i) { 781 table_data_i = table_data[i]; 782 PetscCall(PetscHMapIGetSize(table_data_i, &tcount)); 783 if (tcount_max < tcount) tcount_max = tcount; 784 } 785 PetscCall(PetscMalloc1(tcount_max + 1, &tdata)); 786 #endif 787 788 for (PetscInt i = 0; i < imax; ++i) { 789 #if defined(PETSC_USE_CTABLE) 790 PetscHashIter tpos; 791 PetscInt j; 792 793 table_data_i = table_data[i]; 794 PetscHashIterBegin(table_data_i, tpos); 795 while (!PetscHashIterAtEnd(table_data_i, tpos)) { 796 PetscHashIterGetKey(table_data_i, tpos, k); 797 PetscHashIterGetVal(table_data_i, tpos, j); 798 PetscHashIterNext(table_data_i, tpos); 799 tdata[--j] = --k; 800 } 801 PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i)); 802 #else 803 PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i)); 804 #endif 805 PetscCall(PetscCommDestroy(&iscomms[i])); 806 } 807 808 PetscCall(PetscFree(iscomms)); 809 PetscCall(PetscFree(onodes2)); 810 PetscCall(PetscFree(olengths2)); 811 812 PetscCall(PetscFree(pa)); 813 PetscCall(PetscFree(rbuf2[0])); 814 PetscCall(PetscFree(rbuf2)); 815 PetscCall(PetscFree(s_waits1)); 816 PetscCall(PetscFree(r_waits1)); 817 PetscCall(PetscFree(s_waits2)); 818 PetscCall(PetscFree(r_waits2)); 819 PetscCall(PetscFree(recv_status)); 820 if (xdata) { 821 PetscCall(PetscFree(xdata[0])); 822 PetscCall(PetscFree(xdata)); 823 } 824 PetscCall(PetscFree(isz1)); 825 #if defined(PETSC_USE_CTABLE) 826 for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i)); 827 PetscCall(PetscFree(table_data)); 828 PetscCall(PetscFree(tdata)); 829 
PetscCall(PetscFree4(table, data, isz, t_p)); 830 #else 831 PetscCall(PetscFree5(table, data, isz, d_p, t_p)); 832 #endif 833 PetscFunctionReturn(PETSC_SUCCESS); 834 } 835 836 /* 837 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 838 the work on the local processor. 839 840 Inputs: 841 C - MAT_MPIAIJ; 842 imax - total no of index sets processed at a time; 843 table - an array of char - size = m bits. 844 845 Output: 846 isz - array containing the count of the solution elements corresponding 847 to each index set; 848 data or table_data - pointer to the solutions 849 */ 850 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data) 851 { 852 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 853 Mat A = c->A, B = c->B; 854 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data; 855 PetscInt start, end, val, max, rstart, cstart, *ai, *aj; 856 PetscInt *bi, *bj, *garray, i, j, k, row, isz_i; 857 PetscBT table_i; 858 #if defined(PETSC_USE_CTABLE) 859 PetscHMapI table_data_i; 860 PetscHashIter tpos; 861 PetscInt tcount, *tdata; 862 #else 863 PetscInt *data_i; 864 #endif 865 866 PetscFunctionBegin; 867 rstart = C->rmap->rstart; 868 cstart = C->cmap->rstart; 869 ai = a->i; 870 aj = a->j; 871 bi = b->i; 872 bj = b->j; 873 garray = c->garray; 874 875 for (i = 0; i < imax; i++) { 876 #if defined(PETSC_USE_CTABLE) 877 /* copy existing entries of table_data_i into tdata[] */ 878 table_data_i = table_data[i]; 879 PetscCall(PetscHMapIGetSize(table_data_i, &tcount)); 880 PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]); 881 882 PetscCall(PetscMalloc1(tcount, &tdata)); 883 PetscHashIterBegin(table_data_i, tpos); 884 while (!PetscHashIterAtEnd(table_data_i, tpos)) { 885 PetscHashIterGetKey(table_data_i, tpos, row); 886 PetscHashIterGetVal(table_data_i, tpos, j); 887 
PetscHashIterNext(table_data_i, tpos); 888 tdata[--j] = --row; 889 PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount); 890 } 891 #else 892 data_i = data[i]; 893 #endif 894 table_i = table[i]; 895 isz_i = isz[i]; 896 max = isz[i]; 897 898 for (j = 0; j < max; j++) { 899 #if defined(PETSC_USE_CTABLE) 900 row = tdata[j] - rstart; 901 #else 902 row = data_i[j] - rstart; 903 #endif 904 start = ai[row]; 905 end = ai[row + 1]; 906 for (k = start; k < end; k++) { /* Amat */ 907 val = aj[k] + cstart; 908 if (!PetscBTLookupSet(table_i, val)) { 909 #if defined(PETSC_USE_CTABLE) 910 PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1)); 911 #else 912 data_i[isz_i] = val; 913 #endif 914 isz_i++; 915 } 916 } 917 start = bi[row]; 918 end = bi[row + 1]; 919 for (k = start; k < end; k++) { /* Bmat */ 920 val = garray[bj[k]]; 921 if (!PetscBTLookupSet(table_i, val)) { 922 #if defined(PETSC_USE_CTABLE) 923 PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1)); 924 #else 925 data_i[isz_i] = val; 926 #endif 927 isz_i++; 928 } 929 } 930 } 931 isz[i] = isz_i; 932 933 #if defined(PETSC_USE_CTABLE) 934 PetscCall(PetscFree(tdata)); 935 #endif 936 } 937 PetscFunctionReturn(PETSC_SUCCESS); 938 } 939 940 /* 941 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 942 and return the output 943 944 Input: 945 C - the matrix 946 nrqr - no of messages being processed. 947 rbuf - an array of pointers to the received requests 948 949 Output: 950 xdata - array of messages to be sent back 951 isz1 - size of each message 952 953 For better efficiency perhaps we should malloc separately each xdata[i], 954 then if a remalloc is required we need only copy the data for that one row 955 rather than all previous rows as it is now where a single large chunk of 956 memory is used. 
 */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* first pass over all received messages: ct = total number of requested rows,
     total_sz = total number of indices carried in the requests */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  /* heuristic initial size for the single contiguous reply buffer xdata[0];
     grown by 1.5x below whenever it fills up (no_malloc counts the regrows) */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0;
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    /* ct1 reads the request, ct2 writes the reply; both skip the 2*rbuf_0+1 header */
    ct1 = 2 * rbuf_0 + 1;
    ct2 = ct1;
    ct3 += ct1;
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m, xtable));
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      /* first copy the requested rows themselves, deduplicated via the bit table */
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            /* reply buffer full: grow xdata[0] by 1.5x and rebase the per-message
               pointers xdata[1..i], which all alias slices of the one allocation */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append the column indices of each row just stored (one level of overlap) */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) { /* diagonal block: shift to global columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) { /* off-diagonal block: garray gives global columns */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-rebase as above */
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
    isz1[i] = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt size, rank;
  PetscInt    sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
  PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* Tell every processor the number of nonzeros per row */
    PetscCall(PetscCalloc1(A->rmap->N, &lens));
    for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++)
      lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];

    /* All MPI processes get the same matrix */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Create the sequential matrix of the same type as the local block diagonal */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /* zero column space; nrecv = total nonzeros of the global matrix */
    nrecv = 0;
    for (PetscMPIInt i = 0; i < size; i++) {
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
    }
    PetscCall(PetscArrayzero(b->j, nrecv));

    /* Copy my part of matrix column indices over, directly into my slice of b->j;
       the subsequent MPI_SUM allreduce merges all slices (zeros elsewhere) */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (PetscInt i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* send column indices, b->j was zeroed */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Assemble the matrix into useable form (numerical values not yet set) */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /* Copy my part of matrix numerical values into the values location */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf;

    /* initialize b->a */
    PetscCall(PetscArrayzero(b->a, b->nz));

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleave as for column indices above, so that
       values land in the same order as the indices already stored in b->j */
    for (PetscInt i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* send values, b->a was zeroed */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));

    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
  Mat             submat, A = c->A, B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
  PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
  PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
  const PetscInt *icol, *irow;
  PetscInt        nrow, ncol, start;
  PetscMPIInt     rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
  PetscInt      **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
  PetscInt        msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
  PetscInt      **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
  PetscInt       *lens, rmax, ncols, *cols, Crow;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI cmap, rmap;
  PetscInt  *cmap_loc, *rmap_loc;
#else
  PetscInt *cmap, *rmap;
#endif
  PetscInt     ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
  PetscInt    *cworkB, lwrite, *subcols, ib, jb;
  PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
  MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
  MPI_Status
*r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1239 MPI_Status *r_status3 = NULL, *r_status4, *s_status4; 1240 MPI_Comm comm; 1241 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1242 PetscMPIInt *onodes1, *olengths1, idex, end, *row2proc; 1243 Mat_SubSppt *smatis1; 1244 PetscBool isrowsorted, iscolsorted; 1245 1246 PetscFunctionBegin; 1247 PetscValidLogicalCollectiveInt(C, ismax, 2); 1248 PetscValidLogicalCollectiveEnum(C, scall, 5); 1249 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1250 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1251 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1252 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1253 size = c->size; 1254 rank = c->rank; 1255 1256 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1257 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1258 PetscCall(ISGetIndices(isrow[0], &irow)); 1259 PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1260 if (allcolumns) { 1261 icol = NULL; 1262 ncol = C->cmap->N; 1263 } else { 1264 PetscCall(ISGetIndices(iscol[0], &icol)); 1265 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1266 } 1267 1268 if (scall == MAT_INITIAL_MATRIX) { 1269 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1270 1271 /* Get some new tags to keep the communication clean */ 1272 tag1 = ((PetscObject)C)->tag; 1273 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1274 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1275 1276 /* evaluate communication - mesg to who, length of mesg, and buffer space 1277 required. 
Based on this, buffers are allocated, and data copied into them */ 1278 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1279 PetscCall(PetscMalloc1(nrow, &row2proc)); 1280 1281 /* w1[proc] = num of rows owned by proc -- to be requested */ 1282 proc = 0; 1283 nrqs = 0; /* num of outgoing messages */ 1284 for (PetscInt j = 0; j < nrow; j++) { 1285 row = irow[j]; 1286 if (!isrowsorted) proc = 0; 1287 while (row >= C->rmap->range[proc + 1]) proc++; 1288 w1[proc]++; 1289 row2proc[j] = proc; /* map row index to proc */ 1290 1291 if (proc != rank && !w2[proc]) { 1292 w2[proc] = 1; 1293 nrqs++; 1294 } 1295 } 1296 w1[rank] = 0; /* rows owned by self will not be requested */ 1297 1298 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1299 for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) { 1300 if (w1[proc]) pa[j++] = proc; 1301 } 1302 1303 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1304 msz = 0; /* total mesg length (for all procs) */ 1305 for (PetscMPIInt i = 0; i < nrqs; i++) { 1306 proc = pa[i]; 1307 w1[proc] += 3; 1308 msz += w1[proc]; 1309 } 1310 PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1311 1312 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1313 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1314 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1315 1316 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1317 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1318 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1319 1320 /* Now post the Irecvs corresponding to these messages */ 1321 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1322 1323 PetscCall(PetscFree(onodes1)); 1324 PetscCall(PetscFree(olengths1)); 1325 1326 /* Allocate Memory for outgoing messages */ 
1327 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1328 PetscCall(PetscArrayzero(sbuf1, size)); 1329 PetscCall(PetscArrayzero(ptr, size)); 1330 1331 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1332 iptr = tmp; 1333 for (PetscMPIInt i = 0; i < nrqs; i++) { 1334 proc = pa[i]; 1335 sbuf1[proc] = iptr; 1336 iptr += w1[proc]; 1337 } 1338 1339 /* Form the outgoing messages */ 1340 /* Initialize the header space */ 1341 for (PetscMPIInt i = 0; i < nrqs; i++) { 1342 proc = pa[i]; 1343 PetscCall(PetscArrayzero(sbuf1[proc], 3)); 1344 ptr[proc] = sbuf1[proc] + 3; 1345 } 1346 1347 /* Parse the isrow and copy data into outbuf */ 1348 PetscCall(PetscArrayzero(ctr, size)); 1349 for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1350 proc = row2proc[j]; 1351 if (proc != rank) { /* copy to the outgoing buf*/ 1352 *ptr[proc] = irow[j]; 1353 ctr[proc]++; 1354 ptr[proc]++; 1355 } 1356 } 1357 1358 /* Update the headers for the current IS */ 1359 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */ 1360 if ((ctr_j = ctr[j])) { 1361 sbuf1_j = sbuf1[j]; 1362 k = ++sbuf1_j[0]; 1363 sbuf1_j[2 * k] = ctr_j; 1364 sbuf1_j[2 * k - 1] = 0; 1365 } 1366 } 1367 1368 /* Now post the sends */ 1369 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1370 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1371 proc = pa[i]; 1372 PetscCallMPI(MPIU_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i)); 1373 } 1374 1375 /* Post Receives to capture the buffer size */ 1376 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1377 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1378 1379 if (nrqs) rbuf2[0] = tmp + msz; 1380 for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1381 1382 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1383 proc = pa[i]; 1384 PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, 
comm, r_waits2 + i)); 1385 } 1386 1387 PetscCall(PetscFree2(w1, w2)); 1388 1389 /* Send to other procs the buf size they should allocate */ 1390 /* Receive messages*/ 1391 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1392 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1393 1394 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1395 for (PetscMPIInt i = 0; i < nrqr; ++i) { 1396 req_size[i] = 0; 1397 rbuf1_i = rbuf1[i]; 1398 start = 2 * rbuf1_i[0] + 1; 1399 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1400 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1401 sbuf2_i = sbuf2[i]; 1402 for (PetscInt j = start; j < end; j++) { 1403 k = rbuf1_i[j] - rstart; 1404 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1405 sbuf2_i[j] = ncols; 1406 req_size[i] += ncols; 1407 } 1408 req_source1[i] = r_status1[i].MPI_SOURCE; 1409 1410 /* form the header */ 1411 sbuf2_i[0] = req_size[i]; 1412 for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1413 1414 PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1415 } 1416 1417 PetscCall(PetscFree(r_status1)); 1418 PetscCall(PetscFree(r_waits1)); 1419 1420 /* rbuf2 is received, Post recv column indices a->j */ 1421 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1422 1423 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1424 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1425 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 1426 req_source2[i] = r_status2[i].MPI_SOURCE; 1427 PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1428 } 1429 1430 /* Wait on sends1 and sends2 */ 1431 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1432 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1433 PetscCall(PetscFree(s_waits1)); 1434 PetscCall(PetscFree(s_status1)); 1435 1436 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1437 PetscCall(PetscFree4(r_status2, s_waits2, 
r_waits2, s_status2)); 1438 1439 /* Now allocate sending buffers for a->j, and send them off */ 1440 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 1441 jcnt = 0; 1442 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 1443 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0])); 1444 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1445 1446 for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each requested message */ 1447 rbuf1_i = rbuf1[i]; 1448 sbuf_aj_i = sbuf_aj[i]; 1449 ct1 = 2 * rbuf1_i[0] + 1; 1450 ct2 = 0; 1451 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1452 1453 kmax = rbuf1[i][2]; 1454 for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */ 1455 row = rbuf1_i[ct1] - rstart; 1456 nzA = ai[row + 1] - ai[row]; 1457 nzB = bi[row + 1] - bi[row]; 1458 ncols = nzA + nzB; 1459 cworkA = PetscSafePointerPlusOffset(aj, ai[row]); 1460 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1461 1462 /* load the column indices for this row into cols*/ 1463 cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2); 1464 1465 lwrite = 0; 1466 for (PetscInt l = 0; l < nzB; l++) { 1467 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1468 } 1469 for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1470 for (PetscInt l = 0; l < nzB; l++) { 1471 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1472 } 1473 1474 ct2 += ncols; 1475 } 1476 PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1477 } 1478 1479 /* create column map (cmap): global col of C -> local col of submat */ 1480 #if defined(PETSC_USE_CTABLE) 1481 if (!allcolumns) { 1482 PetscCall(PetscHMapICreateWithSize(ncol, &cmap)); 1483 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1484 for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1485 if (icol[j] >= cstart && icol[j] < cend) { 1486 cmap_loc[icol[j] - 
cstart] = j + 1; 1487 } else { /* use PetscHMapI for non-local col indices */ 1488 PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1)); 1489 } 1490 } 1491 } else { 1492 cmap = NULL; 1493 cmap_loc = NULL; 1494 } 1495 PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1496 #else 1497 if (!allcolumns) { 1498 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1499 for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1500 } else { 1501 cmap = NULL; 1502 } 1503 #endif 1504 1505 /* Create lens for MatSeqAIJSetPreallocation() */ 1506 PetscCall(PetscCalloc1(nrow, &lens)); 1507 1508 /* Compute lens from local part of C */ 1509 for (PetscInt j = 0; j < nrow; j++) { 1510 row = irow[j]; 1511 proc = row2proc[j]; 1512 if (proc == rank) { 1513 /* diagonal part A = c->A */ 1514 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1515 cols = PetscSafePointerPlusOffset(aj, ai[row - rstart]); 1516 if (!allcolumns) { 1517 for (PetscInt k = 0; k < ncols; k++) { 1518 #if defined(PETSC_USE_CTABLE) 1519 tcol = cmap_loc[cols[k]]; 1520 #else 1521 tcol = cmap[cols[k] + cstart]; 1522 #endif 1523 if (tcol) lens[j]++; 1524 } 1525 } else { /* allcolumns */ 1526 lens[j] = ncols; 1527 } 1528 1529 /* off-diagonal part B = c->B */ 1530 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1531 cols = PetscSafePointerPlusOffset(bj, bi[row - rstart]); 1532 if (!allcolumns) { 1533 for (PetscInt k = 0; k < ncols; k++) { 1534 #if defined(PETSC_USE_CTABLE) 1535 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1536 #else 1537 tcol = cmap[bmap[cols[k]]]; 1538 #endif 1539 if (tcol) lens[j]++; 1540 } 1541 } else { /* allcolumns */ 1542 lens[j] += ncols; 1543 } 1544 } 1545 } 1546 1547 /* Create row map (rmap): global row of C -> local row of submat */ 1548 #if defined(PETSC_USE_CTABLE) 1549 PetscCall(PetscHMapICreateWithSize(nrow, &rmap)); 1550 for (PetscInt j = 0; j < nrow; j++) { 1551 row = irow[j]; 1552 proc = row2proc[j]; 1553 if (proc == rank) { /* a local row */ 1554 rmap_loc[row - rstart] = j; 
1555 } else { 1556 PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1)); 1557 } 1558 } 1559 #else 1560 PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1561 for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j; 1562 #endif 1563 1564 /* Update lens from offproc data */ 1565 /* recv a->j is done */ 1566 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1567 for (PetscMPIInt i = 0; i < nrqs; i++) { 1568 proc = pa[i]; 1569 sbuf1_i = sbuf1[proc]; 1570 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1571 ct1 = 2 + 1; 1572 ct2 = 0; 1573 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1574 rbuf3_i = rbuf3[i]; /* received C->j */ 1575 1576 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1577 max1 = sbuf1_i[2]; 1578 for (PetscInt k = 0; k < max1; k++, ct1++) { 1579 #if defined(PETSC_USE_CTABLE) 1580 PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row)); 1581 row--; 1582 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1583 #else 1584 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1585 #endif 1586 /* Now, store row index of submat in sbuf1_i[ct1] */ 1587 sbuf1_i[ct1] = row; 1588 1589 nnz = rbuf2_i[ct1]; 1590 if (!allcolumns) { 1591 for (PetscMPIInt l = 0; l < nnz; l++, ct2++) { 1592 #if defined(PETSC_USE_CTABLE) 1593 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1594 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1595 } else { 1596 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1597 } 1598 #else 1599 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1600 #endif 1601 if (tcol) lens[row]++; 1602 } 1603 } else { /* allcolumns */ 1604 lens[row] += nnz; 1605 } 1606 } 1607 } 1608 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1609 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1610 1611 /* Create the submatrices */ 1612 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1613 
PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1614 1615 PetscCall(ISGetBlockSize(isrow[0], &ib)); 1616 PetscCall(ISGetBlockSize(iscol[0], &jb)); 1617 if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb)); 1618 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1619 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1620 1621 /* create struct Mat_SubSppt and attached it to submat */ 1622 PetscCall(PetscNew(&smatis1)); 1623 subc = (Mat_SeqAIJ *)submat->data; 1624 subc->submatis1 = smatis1; 1625 1626 smatis1->id = 0; 1627 smatis1->nrqs = nrqs; 1628 smatis1->nrqr = nrqr; 1629 smatis1->rbuf1 = rbuf1; 1630 smatis1->rbuf2 = rbuf2; 1631 smatis1->rbuf3 = rbuf3; 1632 smatis1->sbuf2 = sbuf2; 1633 smatis1->req_source2 = req_source2; 1634 1635 smatis1->sbuf1 = sbuf1; 1636 smatis1->ptr = ptr; 1637 smatis1->tmp = tmp; 1638 smatis1->ctr = ctr; 1639 1640 smatis1->pa = pa; 1641 smatis1->req_size = req_size; 1642 smatis1->req_source1 = req_source1; 1643 1644 smatis1->allcolumns = allcolumns; 1645 smatis1->singleis = PETSC_TRUE; 1646 smatis1->row2proc = row2proc; 1647 smatis1->rmap = rmap; 1648 smatis1->cmap = cmap; 1649 #if defined(PETSC_USE_CTABLE) 1650 smatis1->rmap_loc = rmap_loc; 1651 smatis1->cmap_loc = cmap_loc; 1652 #endif 1653 1654 smatis1->destroy = submat->ops->destroy; 1655 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1656 submat->factortype = C->factortype; 1657 1658 /* compute rmax */ 1659 rmax = 0; 1660 for (PetscMPIInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1661 1662 } else { /* scall == MAT_REUSE_MATRIX */ 1663 submat = submats[0]; 1664 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1665 1666 subc = (Mat_SeqAIJ *)submat->data; 1667 rmax = subc->rmax; 1668 smatis1 = subc->submatis1; 1669 nrqs = smatis1->nrqs; 1670 nrqr = smatis1->nrqr; 1671 rbuf1 = smatis1->rbuf1; 1672 rbuf2 = smatis1->rbuf2; 1673 rbuf3 = smatis1->rbuf3; 1674 req_source2 = smatis1->req_source2; 1675 1676 sbuf1 = smatis1->sbuf1; 1677 sbuf2 = smatis1->sbuf2; 1678 ptr = smatis1->ptr; 1679 tmp = smatis1->tmp; 1680 ctr = smatis1->ctr; 1681 1682 pa = smatis1->pa; 1683 req_size = smatis1->req_size; 1684 req_source1 = smatis1->req_source1; 1685 1686 allcolumns = smatis1->allcolumns; 1687 row2proc = smatis1->row2proc; 1688 rmap = smatis1->rmap; 1689 cmap = smatis1->cmap; 1690 #if defined(PETSC_USE_CTABLE) 1691 rmap_loc = smatis1->rmap_loc; 1692 cmap_loc = smatis1->cmap_loc; 1693 #endif 1694 } 1695 1696 /* Post recv matrix values */ 1697 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1698 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1699 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1700 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1701 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1702 PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1703 } 1704 1705 /* Allocate sending buffers for a->a, and send them off */ 1706 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1707 jcnt = 0; 1708 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 1709 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0])); 1710 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1711 1712 for (PetscMPIInt i = 0; i < nrqr; i++) { 1713 rbuf1_i = rbuf1[i]; 1714 sbuf_aa_i = sbuf_aa[i]; 1715 ct1 = 2 * rbuf1_i[0] + 1; 1716 ct2 = 0; 1717 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1718 1719 kmax = rbuf1_i[2]; 1720 for (PetscInt k = 0; k < kmax; k++, ct1++) { 1721 row = rbuf1_i[ct1] - rstart; 1722 nzA 
= ai[row + 1] - ai[row]; 1723 nzB = bi[row + 1] - bi[row]; 1724 ncols = nzA + nzB; 1725 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1726 vworkA = PetscSafePointerPlusOffset(a_a, ai[row]); 1727 vworkB = PetscSafePointerPlusOffset(b_a, bi[row]); 1728 1729 /* load the column values for this row into vals*/ 1730 vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2); 1731 1732 lwrite = 0; 1733 for (PetscInt l = 0; l < nzB; l++) { 1734 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1735 } 1736 for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1737 for (PetscInt l = 0; l < nzB; l++) { 1738 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1739 } 1740 1741 ct2 += ncols; 1742 } 1743 PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1744 } 1745 1746 /* Assemble submat */ 1747 /* First assemble the local rows */ 1748 for (PetscInt j = 0; j < nrow; j++) { 1749 row = irow[j]; 1750 proc = row2proc[j]; 1751 if (proc == rank) { 1752 Crow = row - rstart; /* local row index of C */ 1753 #if defined(PETSC_USE_CTABLE) 1754 row = rmap_loc[Crow]; /* row index of submat */ 1755 #else 1756 row = rmap[row]; 1757 #endif 1758 1759 if (allcolumns) { 1760 PetscInt ncol = 0; 1761 1762 /* diagonal part A = c->A */ 1763 ncols = ai[Crow + 1] - ai[Crow]; 1764 cols = PetscSafePointerPlusOffset(aj, ai[Crow]); 1765 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1766 for (PetscInt k = 0; k < ncols; k++) { 1767 subcols[ncol] = cols[k] + cstart; 1768 subvals[ncol++] = vals[k]; 1769 } 1770 1771 /* off-diagonal part B = c->B */ 1772 ncols = bi[Crow + 1] - bi[Crow]; 1773 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1774 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1775 for (PetscInt k = 0; k < ncols; k++) { 1776 subcols[ncol] = bmap[cols[k]]; 1777 subvals[ncol++] = vals[k]; 1778 } 1779 1780 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES)); 1781 1782 } else { /* 
!allcolumns */ 1783 PetscInt ncol = 0; 1784 1785 #if defined(PETSC_USE_CTABLE) 1786 /* diagonal part A = c->A */ 1787 ncols = ai[Crow + 1] - ai[Crow]; 1788 cols = PetscSafePointerPlusOffset(aj, ai[Crow]); 1789 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1790 for (PetscInt k = 0; k < ncols; k++) { 1791 tcol = cmap_loc[cols[k]]; 1792 if (tcol) { 1793 subcols[ncol] = --tcol; 1794 subvals[ncol++] = vals[k]; 1795 } 1796 } 1797 1798 /* off-diagonal part B = c->B */ 1799 ncols = bi[Crow + 1] - bi[Crow]; 1800 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1801 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1802 for (PetscInt k = 0; k < ncols; k++) { 1803 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1804 if (tcol) { 1805 subcols[ncol] = --tcol; 1806 subvals[ncol++] = vals[k]; 1807 } 1808 } 1809 #else 1810 /* diagonal part A = c->A */ 1811 ncols = ai[Crow + 1] - ai[Crow]; 1812 cols = aj + ai[Crow]; 1813 vals = a_a + ai[Crow]; 1814 for (PetscInt k = 0; k < ncols; k++) { 1815 tcol = cmap[cols[k] + cstart]; 1816 if (tcol) { 1817 subcols[ncol] = --tcol; 1818 subvals[ncol++] = vals[k]; 1819 } 1820 } 1821 1822 /* off-diagonal part B = c->B */ 1823 ncols = bi[Crow + 1] - bi[Crow]; 1824 cols = bj + bi[Crow]; 1825 vals = b_a + bi[Crow]; 1826 for (PetscInt k = 0; k < ncols; k++) { 1827 tcol = cmap[bmap[cols[k]]]; 1828 if (tcol) { 1829 subcols[ncol] = --tcol; 1830 subvals[ncol++] = vals[k]; 1831 } 1832 } 1833 #endif 1834 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES)); 1835 } 1836 } 1837 } 1838 1839 /* Now assemble the off-proc rows */ 1840 for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */ 1841 /* recv values from other processes */ 1842 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1843 proc = pa[idex]; 1844 sbuf1_i = sbuf1[proc]; 1845 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1846 ct1 = 2 + 1; 1847 ct2 = 0; /* 
count of received C->j */ 1848 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1849 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1850 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1851 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1852 1853 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1854 max1 = sbuf1_i[2]; /* num of rows */ 1855 for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1856 row = sbuf1_i[ct1]; /* row index of submat */ 1857 if (!allcolumns) { 1858 idex = 0; 1859 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1860 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1861 for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1862 #if defined(PETSC_USE_CTABLE) 1863 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1864 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1865 } else { 1866 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1867 } 1868 #else 1869 tcol = cmap[rbuf3_i[ct2]]; 1870 #endif 1871 if (tcol) { 1872 subcols[idex] = --tcol; /* may not be sorted */ 1873 subvals[idex++] = rbuf4_i[ct2]; 1874 1875 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
1876 For reuse, we replace received C->j with index that should be inserted to submat */ 1877 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1878 } 1879 } 1880 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1881 } else { /* scall == MAT_REUSE_MATRIX */ 1882 submat = submats[0]; 1883 subc = (Mat_SeqAIJ *)submat->data; 1884 1885 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1886 for (PetscInt l = 0; l < nnz; l++) { 1887 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1888 subvals[idex++] = rbuf4_i[ct2]; 1889 } 1890 1891 bj = subc->j + subc->i[row]; /* sorted column indices */ 1892 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1893 } 1894 } else { /* allcolumns */ 1895 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1896 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES)); 1897 ct2 += nnz; 1898 } 1899 } 1900 } 1901 1902 /* sending a->a are done */ 1903 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1904 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1905 1906 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1907 PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY)); 1908 submats[0] = submat; 1909 1910 /* Restore the indices */ 1911 PetscCall(ISRestoreIndices(isrow[0], &irow)); 1912 if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol)); 1913 1914 /* Destroy allocated memory */ 1915 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 1916 PetscCall(PetscFree3(rbuf4, subcols, subvals)); 1917 if (sbuf_aa) { 1918 PetscCall(PetscFree(sbuf_aa[0])); 1919 PetscCall(PetscFree(sbuf_aa)); 1920 } 1921 1922 if (scall == MAT_INITIAL_MATRIX) { 1923 PetscCall(PetscFree(lens)); 1924 if (sbuf_aj) { 1925 PetscCall(PetscFree(sbuf_aj[0])); 1926 PetscCall(PetscFree(sbuf_aj)); 1927 } 1928 } 
  PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS - extraction path for the case where every process
  requests exactly one index set (no MPI_Allreduce needed to agree on staging).

  Detects the "all columns" fast path (iscol[0] is the identity IS covering every global
  column of C) and forwards the flag to MatCreateSubMatrices_MPIAIJ_SingleIS_Local(), which
  does the actual communication and assembly.
*/
static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt  ncol;
  PetscBool colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices; 2 slots are zeroed here (submat[0] is the
     result; the second slot is presumably reserved for a dummy/reuse entry — confirm against
     MatCreateSubMatrices_MPIAIJ_SingleIS_Local()) */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol[0], &colflag));
  PetscCall(ISGetLocalSize(iscol[0], &ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCreateSubMatrices_MPIAIJ - extracts ismax sequential submatrices of the parallel AIJ
  matrix C, one per (isrow[i], iscol[i]) pair, working in stages so the per-stage workspace
  stays bounded.

  Collective on C: every rank must execute the same number of stages, hence the reduction
  that globalizes nstages below.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Collect global wantallmatrix and nstages.
     nmax caps the number of submatrices handled per stage: it divides a ~20 MB budget by the
     global column count times sizeof(PetscInt), so wider matrices get fewer submatrices per
     stage (at least 1). */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* Single square identity request covering the whole matrix: candidate for the
         MatCreateSubMatrix_MPIAIJ_All() fast path, overridable via -use_fast_submatrix */
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Encode wantallmatrix as -1/0 so that MPI_MAX acts as a logical AND: out[0] is -1 only
       if every rank set -1. nstages takes the global maximum so all ranks loop the same
       number of (collective) stages. */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else {
      nstages = smat->nstages; /* saved on the MAT_INITIAL_MATRIX pass; see end of function */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  for (PetscInt i = 0, pos = 0; i < nstages; i++) {
    /* This rank may run out of local ISs before the global stage count does; it then calls
       the (collective) _Local routine with max_no == 0 and records a dummy placeholder */
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  Mat              A = c->A;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
  const PetscInt **icol, **irow;
  PetscInt        *nrow, *ncol, start;
  PetscMPIInt      nrqs = 0, *pa, proc = -1;
  PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
  PetscInt
**sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row; 2059 PetscInt msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol; 2060 PetscInt **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2061 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2062 #if defined(PETSC_USE_CTABLE) 2063 PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i; 2064 #else 2065 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2066 #endif 2067 const PetscInt *irow_i; 2068 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2069 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2070 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2071 MPI_Comm comm; 2072 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2073 PetscMPIInt *onodes1, *olengths1, end, **row2proc, *row2proc_i; 2074 PetscInt ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2075 Mat_SubSppt *smat_i; 2076 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2077 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt; 2078 2079 PetscFunctionBegin; 2080 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2081 size = c->size; 2082 rank = c->rank; 2083 2084 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2085 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2086 2087 for (PetscInt i = 0; i < ismax; i++) { 2088 PetscCall(ISSorted(iscol[i], &issorted[i])); 2089 if (!issorted[i]) iscsorted = issorted[i]; 2090 2091 PetscCall(ISSorted(isrow[i], &issorted[i])); 2092 2093 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2094 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2095 2096 /* Check for special case: allcolumn */ 2097 PetscCall(ISIdentity(iscol[i], &colflag)); 2098 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2099 if (colflag && ncol[i] == C->cmap->N) { 2100 allcolumns[i] = PETSC_TRUE; 2101 icol[i] = NULL; 2102 } else { 2103 allcolumns[i] 
= PETSC_FALSE; 2104 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2105 } 2106 } 2107 2108 if (scall == MAT_REUSE_MATRIX) { 2109 /* Assumes new rows are same length as the old rows */ 2110 for (PetscInt i = 0; i < ismax; i++) { 2111 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2112 subc = (Mat_SeqAIJ *)submats[i]->data; 2113 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size"); 2114 2115 /* Initial matrix as if empty */ 2116 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2117 2118 smat_i = subc->submatis1; 2119 2120 nrqs = smat_i->nrqs; 2121 nrqr = smat_i->nrqr; 2122 rbuf1 = smat_i->rbuf1; 2123 rbuf2 = smat_i->rbuf2; 2124 rbuf3 = smat_i->rbuf3; 2125 req_source2 = smat_i->req_source2; 2126 2127 sbuf1 = smat_i->sbuf1; 2128 sbuf2 = smat_i->sbuf2; 2129 ptr = smat_i->ptr; 2130 tmp = smat_i->tmp; 2131 ctr = smat_i->ctr; 2132 2133 pa = smat_i->pa; 2134 req_size = smat_i->req_size; 2135 req_source1 = smat_i->req_source1; 2136 2137 allcolumns[i] = smat_i->allcolumns; 2138 row2proc[i] = smat_i->row2proc; 2139 rmap[i] = smat_i->rmap; 2140 cmap[i] = smat_i->cmap; 2141 } 2142 2143 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2144 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2145 smat_i = (Mat_SubSppt *)submats[0]->data; 2146 2147 nrqs = smat_i->nrqs; 2148 nrqr = smat_i->nrqr; 2149 rbuf1 = smat_i->rbuf1; 2150 rbuf2 = smat_i->rbuf2; 2151 rbuf3 = smat_i->rbuf3; 2152 req_source2 = smat_i->req_source2; 2153 2154 sbuf1 = smat_i->sbuf1; 2155 sbuf2 = smat_i->sbuf2; 2156 ptr = smat_i->ptr; 2157 tmp = smat_i->tmp; 2158 ctr = smat_i->ctr; 2159 2160 pa = smat_i->pa; 2161 req_size = smat_i->req_size; 2162 req_source1 = smat_i->req_source1; 2163 2164 allcolumns[0] = PETSC_FALSE; 2165 } 2166 } else { /* scall == MAT_INITIAL_MATRIX */ 
2167 /* Get some new tags to keep the communication clean */ 2168 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2169 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2170 2171 /* evaluate communication - mesg to who, length of mesg, and buffer space 2172 required. Based on this, buffers are allocated, and data copied into them*/ 2173 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2174 2175 for (PetscInt i = 0; i < ismax; i++) { 2176 jmax = nrow[i]; 2177 irow_i = irow[i]; 2178 2179 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2180 row2proc[i] = row2proc_i; 2181 2182 if (issorted[i]) proc = 0; 2183 for (PetscInt j = 0; j < jmax; j++) { 2184 if (!issorted[i]) proc = 0; 2185 row = irow_i[j]; 2186 while (row >= C->rmap->range[proc + 1]) proc++; 2187 w4[proc]++; 2188 row2proc_i[j] = proc; /* map row index to proc */ 2189 } 2190 for (PetscMPIInt j = 0; j < size; j++) { 2191 if (w4[j]) { 2192 w1[j] += w4[j]; 2193 w3[j]++; 2194 w4[j] = 0; 2195 } 2196 } 2197 } 2198 2199 nrqs = 0; /* no of outgoing messages */ 2200 msz = 0; /* total mesg length (for all procs) */ 2201 w1[rank] = 0; /* no mesg sent to self */ 2202 w3[rank] = 0; 2203 for (PetscMPIInt i = 0; i < size; i++) { 2204 if (w1[i]) { 2205 w2[i] = 1; 2206 nrqs++; 2207 } /* there exists a message to proc i */ 2208 } 2209 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2210 for (PetscMPIInt i = 0, j = 0; i < size; i++) { 2211 if (w1[i]) { 2212 pa[j] = i; 2213 j++; 2214 } 2215 } 2216 2217 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2218 for (PetscMPIInt i = 0; i < nrqs; i++) { 2219 PetscMPIInt j = pa[i]; 2220 w1[j] += w2[j] + 2 * w3[j]; 2221 msz += w1[j]; 2222 } 2223 PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2224 2225 /* Determine the number of messages to expect, their lengths, from from-ids */ 2226 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, 
&nrqr)); 2227 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2228 2229 /* Now post the Irecvs corresponding to these messages */ 2230 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2231 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2232 2233 /* Allocate Memory for outgoing messages */ 2234 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2235 PetscCall(PetscArrayzero(sbuf1, size)); 2236 PetscCall(PetscArrayzero(ptr, size)); 2237 2238 { 2239 PetscInt *iptr = tmp; 2240 k = 0; 2241 for (PetscMPIInt i = 0; i < nrqs; i++) { 2242 PetscMPIInt j = pa[i]; 2243 iptr += k; 2244 sbuf1[j] = iptr; 2245 k = w1[j]; 2246 } 2247 } 2248 2249 /* Form the outgoing messages. Initialize the header space */ 2250 for (PetscMPIInt i = 0; i < nrqs; i++) { 2251 PetscMPIInt j = pa[i]; 2252 sbuf1[j][0] = 0; 2253 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2254 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2255 } 2256 2257 /* Parse the isrow and copy data into outbuf */ 2258 for (PetscInt i = 0; i < ismax; i++) { 2259 row2proc_i = row2proc[i]; 2260 PetscCall(PetscArrayzero(ctr, size)); 2261 irow_i = irow[i]; 2262 jmax = nrow[i]; 2263 for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2264 proc = row2proc_i[j]; 2265 if (proc != rank) { /* copy to the outgoing buf*/ 2266 ctr[proc]++; 2267 *ptr[proc] = irow_i[j]; 2268 ptr[proc]++; 2269 } 2270 } 2271 /* Update the headers for the current IS */ 2272 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */ 2273 if ((ctr_j = ctr[j])) { 2274 sbuf1_j = sbuf1[j]; 2275 k = ++sbuf1_j[0]; 2276 sbuf1_j[2 * k] = ctr_j; 2277 sbuf1_j[2 * k - 1] = i; 2278 } 2279 } 2280 } 2281 2282 /* Now post the sends */ 2283 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2284 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2285 PetscMPIInt j = pa[i]; 2286 PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2287 } 
2288 2289 /* Post Receives to capture the buffer size */ 2290 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2291 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2292 if (nrqs) rbuf2[0] = tmp + msz; 2293 for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2294 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2295 PetscMPIInt j = pa[i]; 2296 PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2297 } 2298 2299 /* Send to other procs the buf size they should allocate */ 2300 /* Receive messages*/ 2301 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2302 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2303 { 2304 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2305 PetscInt *sbuf2_i; 2306 2307 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2308 for (PetscMPIInt i = 0; i < nrqr; ++i) { 2309 req_size[i] = 0; 2310 rbuf1_i = rbuf1[i]; 2311 start = 2 * rbuf1_i[0] + 1; 2312 end = olengths1[i]; 2313 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2314 sbuf2_i = sbuf2[i]; 2315 for (PetscInt j = start; j < end; j++) { 2316 id = rbuf1_i[j] - rstart; 2317 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2318 sbuf2_i[j] = ncols; 2319 req_size[i] += ncols; 2320 } 2321 req_source1[i] = onodes1[i]; 2322 /* form the header */ 2323 sbuf2_i[0] = req_size[i]; 2324 for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2325 2326 PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2327 } 2328 } 2329 2330 PetscCall(PetscFree(onodes1)); 2331 PetscCall(PetscFree(olengths1)); 2332 PetscCall(PetscFree(r_waits1)); 2333 PetscCall(PetscFree4(w1, w2, w3, w4)); 2334 2335 /* Receive messages*/ 2336 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2337 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2338 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2339 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2340 req_source2[i] = 
pa[i]; 2341 PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2342 } 2343 PetscCall(PetscFree(r_waits2)); 2344 2345 /* Wait on sends1 and sends2 */ 2346 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2347 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2348 PetscCall(PetscFree(s_waits1)); 2349 PetscCall(PetscFree(s_waits2)); 2350 2351 /* Now allocate sending buffers for a->j, and send them off */ 2352 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 2353 jcnt = 0; 2354 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 2355 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0])); 2356 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2357 2358 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2359 { 2360 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2361 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2362 PetscInt cend = C->cmap->rend; 2363 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2364 2365 for (PetscMPIInt i = 0; i < nrqr; i++) { 2366 rbuf1_i = rbuf1[i]; 2367 sbuf_aj_i = sbuf_aj[i]; 2368 ct1 = 2 * rbuf1_i[0] + 1; 2369 ct2 = 0; 2370 for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2371 kmax = rbuf1[i][2 * j]; 2372 for (PetscInt k = 0; k < kmax; k++, ct1++) { 2373 row = rbuf1_i[ct1] - rstart; 2374 nzA = a_i[row + 1] - a_i[row]; 2375 nzB = b_i[row + 1] - b_i[row]; 2376 ncols = nzA + nzB; 2377 cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]); 2378 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2379 2380 /* load the column indices for this row into cols */ 2381 cols = sbuf_aj_i + ct2; 2382 2383 lwrite = 0; 2384 for (PetscInt l = 0; l < nzB; l++) { 2385 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2386 } 2387 for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2388 for (PetscInt l = 0; l < nzB; l++) { 2389 if ((ctmp = bmap[cworkB[l]]) >= cend) 
cols[lwrite++] = ctmp; 2390 } 2391 2392 ct2 += ncols; 2393 } 2394 } 2395 PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2396 } 2397 } 2398 2399 /* create col map: global col of C -> local col of submatrices */ 2400 { 2401 const PetscInt *icol_i; 2402 #if defined(PETSC_USE_CTABLE) 2403 for (PetscInt i = 0; i < ismax; i++) { 2404 if (!allcolumns[i]) { 2405 PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i)); 2406 2407 jmax = ncol[i]; 2408 icol_i = icol[i]; 2409 cmap_i = cmap[i]; 2410 for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1)); 2411 } else cmap[i] = NULL; 2412 } 2413 #else 2414 for (PetscInt i = 0; i < ismax; i++) { 2415 if (!allcolumns[i]) { 2416 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2417 jmax = ncol[i]; 2418 icol_i = icol[i]; 2419 cmap_i = cmap[i]; 2420 for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2421 } else cmap[i] = NULL; 2422 } 2423 #endif 2424 } 2425 2426 /* Create lens which is required for MatCreate... 
*/ 2427 jcnt = 0; 2428 for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i]; 2429 PetscCall(PetscMalloc1(ismax, &lens)); 2430 2431 if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0])); 2432 for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]); 2433 2434 /* Update lens from local data */ 2435 for (PetscInt i = 0; i < ismax; i++) { 2436 row2proc_i = row2proc[i]; 2437 jmax = nrow[i]; 2438 if (!allcolumns[i]) cmap_i = cmap[i]; 2439 irow_i = irow[i]; 2440 lens_i = lens[i]; 2441 for (PetscInt j = 0; j < jmax; j++) { 2442 row = irow_i[j]; 2443 proc = row2proc_i[j]; 2444 if (proc == rank) { 2445 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2446 if (!allcolumns[i]) { 2447 for (PetscInt k = 0; k < ncols; k++) { 2448 #if defined(PETSC_USE_CTABLE) 2449 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2450 #else 2451 tcol = cmap_i[cols[k]]; 2452 #endif 2453 if (tcol) lens_i[j]++; 2454 } 2455 } else { /* allcolumns */ 2456 lens_i[j] = ncols; 2457 } 2458 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2459 } 2460 } 2461 } 2462 2463 /* Create row map: global row of C -> local row of submatrices */ 2464 #if defined(PETSC_USE_CTABLE) 2465 for (PetscInt i = 0; i < ismax; i++) { 2466 PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i)); 2467 irow_i = irow[i]; 2468 jmax = nrow[i]; 2469 for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1)); 2470 } 2471 #else 2472 for (PetscInt i = 0; i < ismax; i++) { 2473 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2474 rmap_i = rmap[i]; 2475 irow_i = irow[i]; 2476 jmax = nrow[i]; 2477 for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2478 } 2479 #endif 2480 2481 /* Update lens from offproc data */ 2482 { 2483 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2484 2485 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2486 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2487 sbuf1_i = sbuf1[pa[tmp2]]; 2488 jmax = 
sbuf1_i[0]; 2489 ct1 = 2 * jmax + 1; 2490 ct2 = 0; 2491 rbuf2_i = rbuf2[tmp2]; 2492 rbuf3_i = rbuf3[tmp2]; 2493 for (PetscInt j = 1; j <= jmax; j++) { 2494 is_no = sbuf1_i[2 * j - 1]; 2495 max1 = sbuf1_i[2 * j]; 2496 lens_i = lens[is_no]; 2497 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2498 rmap_i = rmap[is_no]; 2499 for (PetscInt k = 0; k < max1; k++, ct1++) { 2500 #if defined(PETSC_USE_CTABLE) 2501 PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row)); 2502 row--; 2503 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2504 #else 2505 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2506 #endif 2507 max2 = rbuf2_i[ct1]; 2508 for (PetscInt l = 0; l < max2; l++, ct2++) { 2509 if (!allcolumns[is_no]) { 2510 #if defined(PETSC_USE_CTABLE) 2511 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2512 #else 2513 tcol = cmap_i[rbuf3_i[ct2]]; 2514 #endif 2515 if (tcol) lens_i[row]++; 2516 } else { /* allcolumns */ 2517 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2518 } 2519 } 2520 } 2521 } 2522 } 2523 } 2524 PetscCall(PetscFree(r_waits3)); 2525 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2526 PetscCall(PetscFree(s_waits3)); 2527 2528 /* Create the submatrices */ 2529 for (PetscInt i = 0; i < ismax; i++) { 2530 PetscInt rbs, cbs; 2531 2532 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2533 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2534 2535 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2536 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2537 2538 if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2539 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2540 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2541 2542 /* create struct Mat_SubSppt and attached it to submat */ 2543 PetscCall(PetscNew(&smat_i)); 2544 subc = (Mat_SeqAIJ *)submats[i]->data; 2545 subc->submatis1 = smat_i; 2546 2547 smat_i->destroy = submats[i]->ops->destroy; 2548 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2549 submats[i]->factortype = C->factortype; 2550 2551 smat_i->id = i; 2552 smat_i->nrqs = nrqs; 2553 smat_i->nrqr = nrqr; 2554 smat_i->rbuf1 = rbuf1; 2555 smat_i->rbuf2 = rbuf2; 2556 smat_i->rbuf3 = rbuf3; 2557 smat_i->sbuf2 = sbuf2; 2558 smat_i->req_source2 = req_source2; 2559 2560 smat_i->sbuf1 = sbuf1; 2561 smat_i->ptr = ptr; 2562 smat_i->tmp = tmp; 2563 smat_i->ctr = ctr; 2564 2565 smat_i->pa = pa; 2566 smat_i->req_size = req_size; 2567 smat_i->req_source1 = req_source1; 2568 2569 smat_i->allcolumns = allcolumns[i]; 2570 smat_i->singleis = PETSC_FALSE; 2571 smat_i->row2proc = row2proc[i]; 2572 smat_i->rmap = rmap[i]; 2573 smat_i->cmap = cmap[i]; 2574 } 2575 2576 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2577 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2578 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2579 PetscCall(MatSetType(submats[0], MATDUMMY)); 2580 
2581 /* create struct Mat_SubSppt and attached it to submat */ 2582 PetscCall(PetscNew(&smat_i)); 2583 submats[0]->data = (void *)smat_i; 2584 2585 smat_i->destroy = submats[0]->ops->destroy; 2586 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2587 submats[0]->factortype = C->factortype; 2588 2589 smat_i->id = 0; 2590 smat_i->nrqs = nrqs; 2591 smat_i->nrqr = nrqr; 2592 smat_i->rbuf1 = rbuf1; 2593 smat_i->rbuf2 = rbuf2; 2594 smat_i->rbuf3 = rbuf3; 2595 smat_i->sbuf2 = sbuf2; 2596 smat_i->req_source2 = req_source2; 2597 2598 smat_i->sbuf1 = sbuf1; 2599 smat_i->ptr = ptr; 2600 smat_i->tmp = tmp; 2601 smat_i->ctr = ctr; 2602 2603 smat_i->pa = pa; 2604 smat_i->req_size = req_size; 2605 smat_i->req_source1 = req_source1; 2606 2607 smat_i->allcolumns = PETSC_FALSE; 2608 smat_i->singleis = PETSC_FALSE; 2609 smat_i->row2proc = NULL; 2610 smat_i->rmap = NULL; 2611 smat_i->cmap = NULL; 2612 } 2613 2614 if (ismax) PetscCall(PetscFree(lens[0])); 2615 PetscCall(PetscFree(lens)); 2616 if (sbuf_aj) { 2617 PetscCall(PetscFree(sbuf_aj[0])); 2618 PetscCall(PetscFree(sbuf_aj)); 2619 } 2620 2621 } /* endof scall == MAT_INITIAL_MATRIX */ 2622 2623 /* Post recv matrix values */ 2624 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2625 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2626 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2627 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2628 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2629 PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2630 } 2631 2632 /* Allocate sending buffers for a->a, and send them off */ 2633 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2634 jcnt = 0; 2635 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 2636 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0])); 2637 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2638 2639 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2640 { 2641 PetscInt nzA, nzB, *a_i = a->i, *b_i = 
b->i, *cworkB, lwrite; 2642 PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2643 PetscInt cend = C->cmap->rend; 2644 PetscInt *b_j = b->j; 2645 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2646 2647 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2648 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2649 for (PetscMPIInt i = 0; i < nrqr; i++) { 2650 rbuf1_i = rbuf1[i]; 2651 sbuf_aa_i = sbuf_aa[i]; 2652 ct1 = 2 * rbuf1_i[0] + 1; 2653 ct2 = 0; 2654 for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2655 kmax = rbuf1_i[2 * j]; 2656 for (PetscInt k = 0; k < kmax; k++, ct1++) { 2657 row = rbuf1_i[ct1] - rstart; 2658 nzA = a_i[row + 1] - a_i[row]; 2659 nzB = b_i[row + 1] - b_i[row]; 2660 ncols = nzA + nzB; 2661 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2662 vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]); 2663 vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]); 2664 2665 /* load the column values for this row into vals*/ 2666 vals = sbuf_aa_i + ct2; 2667 2668 lwrite = 0; 2669 for (PetscInt l = 0; l < nzB; l++) { 2670 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2671 } 2672 for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2673 for (PetscInt l = 0; l < nzB; l++) { 2674 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2675 } 2676 2677 ct2 += ncols; 2678 } 2679 } 2680 PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2681 } 2682 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2683 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2684 } 2685 2686 /* Assemble the matrices */ 2687 /* First assemble the local rows */ 2688 for (PetscInt i = 0; i < ismax; i++) { 2689 row2proc_i = row2proc[i]; 2690 subc = (Mat_SeqAIJ *)submats[i]->data; 2691 imat_ilen = subc->ilen; 2692 imat_j = subc->j; 2693 imat_i = subc->i; 2694 imat_a = subc->a; 2695 2696 if (!allcolumns[i]) 
cmap_i = cmap[i]; 2697 rmap_i = rmap[i]; 2698 irow_i = irow[i]; 2699 jmax = nrow[i]; 2700 for (PetscInt j = 0; j < jmax; j++) { 2701 row = irow_i[j]; 2702 proc = row2proc_i[j]; 2703 if (proc == rank) { 2704 old_row = row; 2705 #if defined(PETSC_USE_CTABLE) 2706 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2707 row--; 2708 #else 2709 row = rmap_i[row]; 2710 #endif 2711 ilen_row = imat_ilen[row]; 2712 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2713 mat_i = imat_i[row]; 2714 mat_a = imat_a + mat_i; 2715 mat_j = imat_j + mat_i; 2716 if (!allcolumns[i]) { 2717 for (PetscInt k = 0; k < ncols; k++) { 2718 #if defined(PETSC_USE_CTABLE) 2719 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2720 #else 2721 tcol = cmap_i[cols[k]]; 2722 #endif 2723 if (tcol) { 2724 *mat_j++ = tcol - 1; 2725 *mat_a++ = vals[k]; 2726 ilen_row++; 2727 } 2728 } 2729 } else { /* allcolumns */ 2730 for (PetscInt k = 0; k < ncols; k++) { 2731 *mat_j++ = cols[k]; /* global col index! 
*/ 2732 *mat_a++ = vals[k]; 2733 ilen_row++; 2734 } 2735 } 2736 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2737 2738 imat_ilen[row] = ilen_row; 2739 } 2740 } 2741 } 2742 2743 /* Now assemble the off proc rows */ 2744 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2745 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2746 sbuf1_i = sbuf1[pa[tmp2]]; 2747 jmax = sbuf1_i[0]; 2748 ct1 = 2 * jmax + 1; 2749 ct2 = 0; 2750 rbuf2_i = rbuf2[tmp2]; 2751 rbuf3_i = rbuf3[tmp2]; 2752 rbuf4_i = rbuf4[tmp2]; 2753 for (PetscInt j = 1; j <= jmax; j++) { 2754 is_no = sbuf1_i[2 * j - 1]; 2755 rmap_i = rmap[is_no]; 2756 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2757 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2758 imat_ilen = subc->ilen; 2759 imat_j = subc->j; 2760 imat_i = subc->i; 2761 imat_a = subc->a; 2762 max1 = sbuf1_i[2 * j]; 2763 for (PetscInt k = 0; k < max1; k++, ct1++) { 2764 row = sbuf1_i[ct1]; 2765 #if defined(PETSC_USE_CTABLE) 2766 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2767 row--; 2768 #else 2769 row = rmap_i[row]; 2770 #endif 2771 ilen = imat_ilen[row]; 2772 mat_i = imat_i[row]; 2773 mat_a = PetscSafePointerPlusOffset(imat_a, mat_i); 2774 mat_j = PetscSafePointerPlusOffset(imat_j, mat_i); 2775 max2 = rbuf2_i[ct1]; 2776 if (!allcolumns[is_no]) { 2777 for (PetscInt l = 0; l < max2; l++, ct2++) { 2778 #if defined(PETSC_USE_CTABLE) 2779 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2780 #else 2781 tcol = cmap_i[rbuf3_i[ct2]]; 2782 #endif 2783 if (tcol) { 2784 *mat_j++ = tcol - 1; 2785 *mat_a++ = rbuf4_i[ct2]; 2786 ilen++; 2787 } 2788 } 2789 } else { /* allcolumns */ 2790 for (PetscInt l = 0; l < max2; l++, ct2++) { 2791 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2792 *mat_a++ = rbuf4_i[ct2]; 2793 ilen++; 2794 } 2795 } 2796 imat_ilen[row] = ilen; 2797 } 2798 } 2799 } 2800 2801 if (!iscsorted) { /* sort column indices of the rows */ 2802 for (PetscInt i = 0; i < ismax; i++) { 
2803 subc = (Mat_SeqAIJ *)submats[i]->data; 2804 imat_j = subc->j; 2805 imat_i = subc->i; 2806 imat_a = subc->a; 2807 imat_ilen = subc->ilen; 2808 2809 if (allcolumns[i]) continue; 2810 jmax = nrow[i]; 2811 for (PetscInt j = 0; j < jmax; j++) { 2812 mat_i = imat_i[j]; 2813 mat_a = imat_a + mat_i; 2814 mat_j = imat_j + mat_i; 2815 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2816 } 2817 } 2818 } 2819 2820 PetscCall(PetscFree(r_waits4)); 2821 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2822 PetscCall(PetscFree(s_waits4)); 2823 2824 /* Restore the indices */ 2825 for (PetscInt i = 0; i < ismax; i++) { 2826 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2827 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2828 } 2829 2830 for (PetscInt i = 0; i < ismax; i++) { 2831 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2832 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2833 } 2834 2835 /* Destroy allocated memory */ 2836 if (sbuf_aa) { 2837 PetscCall(PetscFree(sbuf_aa[0])); 2838 PetscCall(PetscFree(sbuf_aa)); 2839 } 2840 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2841 2842 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2843 PetscCall(PetscFree(rbuf4)); 2844 2845 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2846 PetscFunctionReturn(PETSC_SUCCESS); 2847 } 2848 2849 /* 2850 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2851 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2852 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2853 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2854 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2855 state, and needs to be "assembled" later by compressing B's column space. 
2856 2857 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2858 Following this call, C->A & C->B have been created, even if empty. 2859 */ 2860 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2861 { 2862 /* If making this function public, change the error returned in this function away from _PLIB. */ 2863 Mat_MPIAIJ *aij; 2864 Mat_SeqAIJ *Baij; 2865 PetscBool seqaij, Bdisassembled; 2866 PetscInt m, n, *nz, ngcol, col, rstart, rend, shift, count; 2867 PetscScalar v; 2868 const PetscInt *rowindices, *colindices; 2869 2870 PetscFunctionBegin; 2871 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2872 if (A) { 2873 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2874 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2875 if (rowemb) { 2876 PetscCall(ISGetLocalSize(rowemb, &m)); 2877 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2878 } else { 2879 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2880 } 2881 if (dcolemb) { 2882 PetscCall(ISGetLocalSize(dcolemb, &n)); 2883 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2884 } else { 2885 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2886 } 2887 } 2888 if (B) { 2889 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2890 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2891 if (rowemb) { 2892 
PetscCall(ISGetLocalSize(rowemb, &m)); 2893 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2894 } else { 2895 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2896 } 2897 if (ocolemb) { 2898 PetscCall(ISGetLocalSize(ocolemb, &n)); 2899 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2900 } else { 2901 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2902 } 2903 } 2904 2905 aij = (Mat_MPIAIJ *)C->data; 2906 if (!aij->A) { 2907 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2908 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2909 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2910 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2911 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2912 } 2913 if (A) { 2914 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2915 } else { 2916 PetscCall(MatSetUp(aij->A)); 2917 } 2918 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2919 /* 2920 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2921 need to "disassemble" B -- convert it to using C's global indices. 2922 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2923 2924 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2925 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2926 2927 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 
2928 At least avoid calling MatSetValues() and the implied searches? 2929 */ 2930 2931 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2932 #if defined(PETSC_USE_CTABLE) 2933 PetscCall(PetscHMapIDestroy(&aij->colmap)); 2934 #else 2935 PetscCall(PetscFree(aij->colmap)); 2936 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2937 #endif 2938 ngcol = 0; 2939 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2940 if (aij->garray) PetscCall(PetscFree(aij->garray)); 2941 PetscCall(VecDestroy(&aij->lvec)); 2942 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2943 } 2944 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2945 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2946 } 2947 Bdisassembled = PETSC_FALSE; 2948 if (!aij->B) { 2949 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2950 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2951 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2952 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2953 Bdisassembled = PETSC_TRUE; 2954 } 2955 if (B) { 2956 Baij = (Mat_SeqAIJ *)B->data; 2957 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2958 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2959 for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2960 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2961 PetscCall(PetscFree(nz)); 2962 } 2963 2964 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2965 shift = rend - rstart; 2966 count = 0; 2967 rowindices = NULL; 2968 colindices = NULL; 2969 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2970 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2971 for (PetscInt i = 0; i < B->rmap->n; i++) { 2972 PetscInt row; 2973 row = i; 2974 if (rowindices) row = rowindices[i]; 2975 for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2976 col = Baij->j[count]; 2977 if (colindices) col = 
colindices[col]; 2978 if (Bdisassembled && col >= rstart) col += shift; 2979 v = Baij->a[count]; 2980 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2981 ++count; 2982 } 2983 } 2984 /* No assembly for aij->B is necessary. */ 2985 /* FIXME: set aij->B's nonzerostate correctly. */ 2986 } else { 2987 PetscCall(MatSetUp(aij->B)); 2988 } 2989 C->preallocated = PETSC_TRUE; 2990 C->was_assembled = PETSC_FALSE; 2991 C->assembled = PETSC_FALSE; 2992 /* 2993 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2994 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2995 */ 2996 PetscFunctionReturn(PETSC_SUCCESS); 2997 } 2998 2999 /* 3000 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3001 */ 3002 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 3003 { 3004 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 3005 3006 PetscFunctionBegin; 3007 PetscAssertPointer(A, 2); 3008 PetscAssertPointer(B, 3); 3009 /* FIXME: make sure C is assembled */ 3010 *A = aij->A; 3011 *B = aij->B; 3012 /* Note that we don't incref *A and *B, so be careful! */ 3013 PetscFunctionReturn(PETSC_SUCCESS); 3014 } 3015 3016 /* 3017 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3018 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    cismax, ispar;
  Mat        *A, *B;
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

  /* Count the parallel index sets (comm size > 1); each row/col IS pair must share one communicator. */
  cismax = 0;
  for (PetscInt i = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* if (ispar) */
  /*
     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
         were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
         indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
       NOTE: isrow[i]/iscol[i] are sorted IN PLACE; isrow_p[i]/iscol_p[i] record the permutation
       (NULL if already strictly increasing).
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    for (PetscInt j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (PetscInt j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction; see the comment above the first loop. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices, as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
       values into A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        /* AA (== A[i]) is consumed here; BB is freed collectively via MatDestroySubMatrices(cismax,&B) below. */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* Ownership of A[i] transfers to the output array. */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* AIJ instantiation of the generic MPI submatrix driver above: binds the SeqAIJ/MPIAIJ-specific kernels. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}