1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 /* 22 Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks 23 24 The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but 25 that is a very major recoding job. 
26 27 Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin 28 */ 29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[]) 30 { 31 PetscFunctionBegin; 32 for (PetscInt i = 0; i < imax; i++) { 33 if (!is[i]) break; 34 PetscInt n = 0, N, Nmax, Nmin; 35 const PetscInt *idx; 36 PetscInt *nidx = NULL; 37 MPI_Comm comm; 38 PetscBT bt; 39 40 PetscCall(ISGetLocalSize(is[i], &N)); 41 if (N > 0) { /* Nmax and Nmin are garbage for empty IS */ 42 PetscCall(ISGetIndices(is[i], &idx)); 43 PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax)); 44 Nmin = Nmin / bs; 45 Nmax = Nmax / bs; 46 PetscCall(PetscBTCreate(Nmax - Nmin, &bt)); 47 for (PetscInt j = 0; j < N; j++) { 48 if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++; 49 } 50 PetscCall(PetscMalloc1(n, &nidx)); 51 n = 0; 52 PetscCall(PetscBTMemzero(Nmax - Nmin, bt)); 53 for (PetscInt j = 0; j < N; j++) { 54 if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs; 55 } 56 PetscCall(PetscBTDestroy(&bt)); 57 PetscCall(ISRestoreIndices(is[i], &idx)); 58 } 59 PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm)); 60 PetscCall(ISDestroy(is + i)); 61 PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i)); 62 } 63 PetscFunctionReturn(PETSC_SUCCESS); 64 } 65 66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov) 67 { 68 PetscInt i; 69 70 PetscFunctionBegin; 71 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 72 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is)); 73 if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is)); 74 PetscFunctionReturn(PETSC_SUCCESS); 75 } 76 77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov) 78 { 79 PetscInt i; 80 81 PetscFunctionBegin; 82 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), 
PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 83 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is)); 84 if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is)); 85 PetscFunctionReturn(PETSC_SUCCESS); 86 } 87 88 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[]) 89 { 90 MPI_Comm comm; 91 PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_isids, j; 92 PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata; 93 PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL; 94 const PetscInt *indices_i, **indices; 95 PetscLayout rmap; 96 PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner, *rrow_ranks; 97 PetscSF sf; 98 PetscSFNode *remote; 99 100 PetscFunctionBegin; 101 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 102 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 103 PetscCallMPI(MPI_Comm_size(comm, &size)); 104 /* get row map to determine where rows should be going */ 105 PetscCall(MatGetLayouts(mat, &rmap, NULL)); 106 /* retrieve IS data and put all together so that we 107 * can optimize communication 108 * */ 109 PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length)); 110 tlength = 0; 111 for (PetscInt i = 0; i < nidx; i++) { 112 PetscCall(ISGetLocalSize(is[i], &length[i])); 113 tlength += length[i]; 114 PetscCall(ISGetIndices(is[i], &indices[i])); 115 } 116 /* find these rows on remote processors */ 117 PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids)); 118 PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp)); 119 nrrows = 0; 120 for (PetscInt i = 0; i < nidx; i++) { 121 length_i = length[i]; 122 indices_i = indices[i]; 123 for (PetscInt j = 0; j < length_i; j++) { 124 owner = -1; 125 PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner)); 126 /* remote processors */ 127 if (owner 
!= rank) { 128 tosizes_temp[owner]++; /* number of rows to owner */ 129 rrow_ranks[nrrows] = owner; /* processor */ 130 rrow_isids[nrrows] = i; /* is id */ 131 remoterows[nrrows++] = indices_i[j]; /* row */ 132 } 133 } 134 PetscCall(ISRestoreIndices(is[i], &indices[i])); 135 } 136 PetscCall(PetscFree2(*(PetscInt ***)&indices, length)); 137 /* test if we need to exchange messages 138 * generally speaking, we do not need to exchange 139 * data when overlap is 1 140 * */ 141 PetscCallMPI(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm)); 142 /* we do not have any messages 143 * It usually corresponds to overlap 1 144 * */ 145 if (!reducednrrows) { 146 PetscCall(PetscFree3(toranks, tosizes, tosizes_temp)); 147 PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids)); 148 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is)); 149 PetscFunctionReturn(PETSC_SUCCESS); 150 } 151 nto = 0; 152 /* send sizes and ranks for building a two-sided communication */ 153 for (PetscMPIInt i = 0; i < size; i++) { 154 if (tosizes_temp[i]) { 155 tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */ 156 tosizes_temp[i] = nto; /* a map from processor to index */ 157 toranks[nto++] = i; /* MPI process */ 158 } 159 } 160 PetscCall(PetscMalloc1(nto + 1, &toffsets)); 161 toffsets[0] = 0; 162 for (PetscInt i = 0; i < nto; i++) { 163 toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */ 164 tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */ 165 } 166 /* send information to other processors */ 167 PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes)); 168 nrecvrows = 0; 169 for (PetscMPIInt i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i]; 170 PetscCall(PetscMalloc1(nrecvrows, &remote)); 171 nrecvrows = 0; 172 for (PetscMPIInt i = 0; i < nfrom; i++) { 173 for (PetscInt j = 0; j < fromsizes[2 * i]; j++) { 174 remote[nrecvrows].rank = fromranks[i]; 175 remote[nrecvrows++].index = fromsizes[2 * i + 1] 
+ j; 176 } 177 } 178 PetscCall(PetscSFCreate(comm, &sf)); 179 PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER)); 180 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 181 PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); 182 PetscCall(PetscSFSetFromOptions(sf)); 183 /* message pair <no of is, row> */ 184 PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata)); 185 for (PetscInt i = 0; i < nrrows; i++) { 186 owner = rrow_ranks[i]; /* process */ 187 j = tosizes_temp[owner]; /* index */ 188 todata[toffsets[j]++] = rrow_isids[i]; 189 todata[toffsets[j]++] = remoterows[i]; 190 } 191 PetscCall(PetscFree3(toranks, tosizes, tosizes_temp)); 192 PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids)); 193 PetscCall(PetscFree(toffsets)); 194 PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE)); 195 PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE)); 196 PetscCall(PetscSFDestroy(&sf)); 197 /* send rows belonging to the remote so that then we could get the overlapping data back */ 198 PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata)); 199 PetscCall(PetscFree2(todata, fromdata)); 200 PetscCall(PetscFree(fromsizes)); 201 PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes)); 202 PetscCall(PetscFree(fromranks)); 203 nrecvrows = 0; 204 for (PetscInt i = 0; i < nto; i++) nrecvrows += tosizes[2 * i]; 205 PetscCall(PetscCalloc1(nrecvrows, &todata)); 206 PetscCall(PetscMalloc1(nrecvrows, &remote)); 207 nrecvrows = 0; 208 for (PetscInt i = 0; i < nto; i++) { 209 for (PetscInt j = 0; j < tosizes[2 * i]; j++) { 210 remote[nrecvrows].rank = toranks[i]; 211 remote[nrecvrows++].index = tosizes[2 * i + 1] + j; 212 } 213 } 214 PetscCall(PetscSFCreate(comm, &sf)); 215 PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, 
PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER)); 216 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 217 PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); 218 PetscCall(PetscSFSetFromOptions(sf)); 219 /* overlap communication and computation */ 220 PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE)); 221 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is)); 222 PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE)); 223 PetscCall(PetscSFDestroy(&sf)); 224 PetscCall(PetscFree2(sbdata, sbsizes)); 225 PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata)); 226 PetscCall(PetscFree(toranks)); 227 PetscCall(PetscFree(tosizes)); 228 PetscCall(PetscFree(todata)); 229 PetscFunctionReturn(PETSC_SUCCESS); 230 } 231 232 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 233 { 234 PetscInt *isz, isz_i, i, j, is_id, data_size; 235 PetscInt col, lsize, max_lsize, *indices_temp, *indices_i; 236 const PetscInt *indices_i_temp; 237 MPI_Comm *iscomms; 238 239 PetscFunctionBegin; 240 max_lsize = 0; 241 PetscCall(PetscMalloc1(nidx, &isz)); 242 for (i = 0; i < nidx; i++) { 243 PetscCall(ISGetLocalSize(is[i], &lsize)); 244 max_lsize = lsize > max_lsize ? 
lsize : max_lsize; 245 isz[i] = lsize; 246 } 247 PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms)); 248 for (i = 0; i < nidx; i++) { 249 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL)); 250 PetscCall(ISGetIndices(is[i], &indices_i_temp)); 251 PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i])); 252 PetscCall(ISRestoreIndices(is[i], &indices_i_temp)); 253 PetscCall(ISDestroy(&is[i])); 254 } 255 /* retrieve information to get row id and its overlap */ 256 for (i = 0; i < nrecvs;) { 257 is_id = recvdata[i++]; 258 data_size = recvdata[i++]; 259 indices_i = indices_temp + (max_lsize + nrecvs) * is_id; 260 isz_i = isz[is_id]; 261 for (j = 0; j < data_size; j++) { 262 col = recvdata[i++]; 263 indices_i[isz_i++] = col; 264 } 265 isz[is_id] = isz_i; 266 } 267 /* remove duplicate entities */ 268 for (i = 0; i < nidx; i++) { 269 indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i); 270 isz_i = isz[i]; 271 PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i)); 272 PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i])); 273 PetscCall(PetscCommDestroy(&iscomms[i])); 274 } 275 PetscCall(PetscFree(isz)); 276 PetscCall(PetscFree2(indices_temp, iscomms)); 277 PetscFunctionReturn(PETSC_SUCCESS); 278 } 279 280 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 281 { 282 PetscLayout rmap, cmap; 283 PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i; 284 PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata; 285 PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets; 286 const PetscInt *gcols, *ai, *aj, *bi, *bj; 287 Mat amat, bmat; 288 PetscMPIInt rank; 289 
PetscBool done; 290 MPI_Comm comm; 291 292 PetscFunctionBegin; 293 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 294 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 295 PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols)); 296 /* Even if the mat is symmetric, we still assume it is not symmetric */ 297 PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 298 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 299 PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 300 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 301 /* total number of nonzero values is used to estimate the memory usage in the next step */ 302 tnz = ai[an] + bi[bn]; 303 PetscCall(MatGetLayouts(mat, &rmap, &cmap)); 304 PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL)); 305 PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL)); 306 /* to find the longest message */ 307 max_fszs = 0; 308 for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs; 309 /* better way to estimate number of nonzero in the mat??? 
*/ 310 PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp)); 311 for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i); 312 rows_pos = 0; 313 totalrows = 0; 314 for (i = 0; i < nfrom; i++) { 315 PetscCall(PetscArrayzero(rows_pos_i, nidx)); 316 /* group data together */ 317 for (j = 0; j < fromsizes[2 * i]; j += 2) { 318 is_id = fromrows[rows_pos++]; /* no of is */ 319 rows_i = rows_data[is_id]; 320 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */ 321 } 322 /* estimate a space to avoid multiple allocations */ 323 for (j = 0; j < nidx; j++) { 324 indvc_ij = 0; 325 rows_i = rows_data[j]; 326 for (l = 0; l < rows_pos_i[j]; l++) { 327 row = rows_i[l] - rstart; 328 start = ai[row]; 329 end = ai[row + 1]; 330 for (k = start; k < end; k++) { /* Amat */ 331 col = aj[k] + cstart; 332 indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */ 333 } 334 start = bi[row]; 335 end = bi[row + 1]; 336 for (k = start; k < end; k++) { /* Bmat */ 337 col = gcols[bj[k]]; 338 indices_tmp[indvc_ij++] = col; 339 } 340 } 341 PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp)); 342 indv_counts[i * nidx + j] = indvc_ij; 343 totalrows += indvc_ij; 344 } 345 } 346 /* message triple <no of is, number of rows, rows> */ 347 PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes)); 348 totalrows = 0; 349 rows_pos = 0; 350 /* use this code again */ 351 for (i = 0; i < nfrom; i++) { 352 PetscCall(PetscArrayzero(rows_pos_i, nidx)); 353 for (j = 0; j < fromsizes[2 * i]; j += 2) { 354 is_id = fromrows[rows_pos++]; 355 rows_i = rows_data[is_id]; 356 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 357 } 358 /* add data */ 359 for (j = 0; j < nidx; j++) { 360 if (!indv_counts[i * nidx + j]) continue; 361 indvc_ij = 0; 362 sbdata[totalrows++] = j; 363 sbdata[totalrows++] = indv_counts[i * nidx + j]; 364 
sbsizes[2 * i] += 2; 365 rows_i = rows_data[j]; 366 for (l = 0; l < rows_pos_i[j]; l++) { 367 row = rows_i[l] - rstart; 368 start = ai[row]; 369 end = ai[row + 1]; 370 for (k = start; k < end; k++) { /* Amat */ 371 col = aj[k] + cstart; 372 indices_tmp[indvc_ij++] = col; 373 } 374 start = bi[row]; 375 end = bi[row + 1]; 376 for (k = start; k < end; k++) { /* Bmat */ 377 col = gcols[bj[k]]; 378 indices_tmp[indvc_ij++] = col; 379 } 380 } 381 PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp)); 382 sbsizes[2 * i] += indvc_ij; 383 PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij)); 384 totalrows += indvc_ij; 385 } 386 } 387 PetscCall(PetscMalloc1(nfrom + 1, &offsets)); 388 offsets[0] = 0; 389 for (i = 0; i < nfrom; i++) { 390 offsets[i + 1] = offsets[i] + sbsizes[2 * i]; 391 sbsizes[2 * i + 1] = offsets[i]; 392 } 393 PetscCall(PetscFree(offsets)); 394 if (sbrowsizes) *sbrowsizes = sbsizes; 395 if (sbrows) *sbrows = sbdata; 396 PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp)); 397 PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 398 PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 399 PetscFunctionReturn(PETSC_SUCCESS); 400 } 401 402 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[]) 403 { 404 const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices; 405 PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp; 406 PetscInt lsize, lsize_tmp; 407 PetscMPIInt rank, owner; 408 Mat amat, bmat; 409 PetscBool done; 410 PetscLayout cmap, rmap; 411 MPI_Comm comm; 412 413 PetscFunctionBegin; 414 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 415 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 416 PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols)); 417 PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 418 PetscCheck(done, PETSC_COMM_SELF, 
PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 419 PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 420 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 421 /* is it a safe way to compute number of nonzero values ? */ 422 tnz = ai[an] + bi[bn]; 423 PetscCall(MatGetLayouts(mat, &rmap, &cmap)); 424 PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL)); 425 PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL)); 426 /* it is a better way to estimate memory than the old implementation 427 * where global size of matrix is used 428 * */ 429 PetscCall(PetscMalloc1(tnz, &indices_temp)); 430 for (i = 0; i < nidx; i++) { 431 MPI_Comm iscomm; 432 433 PetscCall(ISGetLocalSize(is[i], &lsize)); 434 PetscCall(ISGetIndices(is[i], &indices)); 435 lsize_tmp = 0; 436 for (j = 0; j < lsize; j++) { 437 owner = -1; 438 row = indices[j]; 439 PetscCall(PetscLayoutFindOwner(rmap, row, &owner)); 440 if (owner != rank) continue; 441 /* local number */ 442 row -= rstart; 443 start = ai[row]; 444 end = ai[row + 1]; 445 for (k = start; k < end; k++) { /* Amat */ 446 col = aj[k] + cstart; 447 indices_temp[lsize_tmp++] = col; 448 } 449 start = bi[row]; 450 end = bi[row + 1]; 451 for (k = start; k < end; k++) { /* Bmat */ 452 col = gcols[bj[k]]; 453 indices_temp[lsize_tmp++] = col; 454 } 455 } 456 PetscCall(ISRestoreIndices(is[i], &indices)); 457 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL)); 458 PetscCall(ISDestroy(&is[i])); 459 PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp)); 460 PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i])); 461 PetscCall(PetscCommDestroy(&iscomm)); 462 } 463 PetscCall(PetscFree(indices_temp)); 464 PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 465 PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 466 PetscFunctionReturn(PETSC_SUCCESS); 467 } 468 469 /* 
470 Sample message format: 471 If a processor A wants processor B to process some elements corresponding 472 to index sets is[1],is[5] 473 mesg [0] = 2 (no of index sets in the mesg) 474 ----------- 475 mesg [1] = 1 => is[1] 476 mesg [2] = sizeof(is[1]); 477 ----------- 478 mesg [3] = 5 => is[5] 479 mesg [4] = sizeof(is[5]); 480 ----------- 481 mesg [5] 482 mesg [n] datas[1] 483 ----------- 484 mesg[n+1] 485 mesg[m] data(is[5]) 486 ----------- 487 488 Notes: 489 nrqs - no of requests sent (or to be sent out) 490 nrqr - no of requests received (which have to be or which have been processed) 491 */ 492 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[]) 493 { 494 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 495 PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2; 496 const PetscInt **idx, *idx_i; 497 PetscInt *n, **data, len; 498 #if defined(PETSC_USE_CTABLE) 499 PetscHMapI *table_data, table_data_i; 500 PetscInt *tdata, tcount, tcount_max; 501 #else 502 PetscInt *data_i, *d_p; 503 #endif 504 PetscMPIInt size, rank, tag1, tag2, proc = 0, nrqs, *pa; 505 PetscInt M, k, **rbuf, row, msz, **outdat, **ptr, *isz1; 506 PetscInt *ctr, *tmp, *isz, **xdata, **rbuf2; 507 PetscBT *table; 508 MPI_Comm comm; 509 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2; 510 MPI_Status *recv_status; 511 MPI_Comm *iscomms; 512 char *t_p; 513 514 PetscFunctionBegin; 515 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 516 size = c->size; 517 rank = c->rank; 518 M = C->rmap->N; 519 520 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1)); 521 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 522 523 PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n)); 524 525 for (PetscInt i = 0; i < imax; i++) { 526 PetscCall(ISGetIndices(is[i], &idx[i])); 527 PetscCall(ISGetLocalSize(is[i], &n[i])); 528 } 529 530 /* evaluate communication - mesg to who,length of mesg, and buffer space 531 required. 
Based on this, buffers are allocated, and data copied into them */ 532 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); 533 for (PetscInt i = 0; i < imax; i++) { 534 PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/ 535 idx_i = idx[i]; 536 len = n[i]; 537 for (PetscInt j = 0; j < len; j++) { 538 row = idx_i[j]; 539 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries"); 540 PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc)); 541 w4[proc]++; 542 } 543 for (PetscMPIInt j = 0; j < size; j++) { 544 if (w4[j]) { 545 w1[j] += w4[j]; 546 w3[j]++; 547 } 548 } 549 } 550 551 nrqs = 0; /* no of outgoing messages */ 552 msz = 0; /* total mesg length (for all proc */ 553 w1[rank] = 0; /* no mesg sent to intself */ 554 w3[rank] = 0; 555 for (PetscMPIInt i = 0; i < size; i++) { 556 if (w1[i]) { 557 w2[i] = 1; 558 nrqs++; 559 } /* there exists a message to proc i */ 560 } 561 /* pa - is list of processors to communicate with */ 562 PetscCall(PetscMalloc1(nrqs, &pa)); 563 for (PetscMPIInt i = 0, j = 0; i < size; i++) { 564 if (w1[i]) { 565 pa[j] = i; 566 j++; 567 } 568 } 569 570 /* Each message would have a header = 1 + 2*(no of IS) + data */ 571 for (PetscMPIInt i = 0; i < nrqs; i++) { 572 PetscMPIInt j = pa[i]; 573 w1[j] += w2[j] + 2 * w3[j]; 574 msz += w1[j]; 575 } 576 577 /* Determine the number of messages to expect, their lengths, from from-ids */ 578 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 579 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 580 581 /* Now post the Irecvs corresponding to these messages */ 582 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1)); 583 584 /* Allocate Memory for outgoing messages */ 585 PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr)); 586 PetscCall(PetscArrayzero(outdat, size)); 587 PetscCall(PetscArrayzero(ptr, size)); 588 589 { 590 PetscInt 
*iptr = tmp, ict = 0; 591 for (PetscMPIInt i = 0; i < nrqs; i++) { 592 PetscMPIInt j = pa[i]; 593 iptr += ict; 594 outdat[j] = iptr; 595 ict = w1[j]; 596 } 597 } 598 599 /* Form the outgoing messages */ 600 /* plug in the headers */ 601 for (PetscMPIInt i = 0; i < nrqs; i++) { 602 PetscMPIInt j = pa[i]; 603 outdat[j][0] = 0; 604 PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j])); 605 ptr[j] = outdat[j] + 2 * w3[j] + 1; 606 } 607 608 /* Memory for doing local proc's work */ 609 { 610 PetscInt M_BPB_imax = 0; 611 #if defined(PETSC_USE_CTABLE) 612 PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax)); 613 PetscCall(PetscMalloc1(imax, &table_data)); 614 for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i)); 615 PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p)); 616 for (PetscInt i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i; 617 #else 618 PetscInt Mimax = 0; 619 PetscCall(PetscIntMultError(M, imax, &Mimax)); 620 PetscCall(PetscIntMultError(M / PETSC_BITS_PER_BYTE + 1, imax, &M_BPB_imax)); 621 PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p)); 622 for (PetscInt i = 0; i < imax; i++) { 623 table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i; 624 data[i] = d_p + M * i; 625 } 626 #endif 627 } 628 629 /* Parse the IS and update local tables and the outgoing buf with the data */ 630 { 631 PetscInt n_i, isz_i, *outdat_j, ctr_j; 632 PetscBT table_i; 633 634 for (PetscInt i = 0; i < imax; i++) { 635 PetscCall(PetscArrayzero(ctr, size)); 636 n_i = n[i]; 637 table_i = table[i]; 638 idx_i = idx[i]; 639 #if defined(PETSC_USE_CTABLE) 640 table_data_i = table_data[i]; 641 #else 642 data_i = data[i]; 643 #endif 644 isz_i = isz[i]; 645 for (PetscInt j = 0; j < n_i; j++) { /* parse the indices of each IS */ 646 row = idx_i[j]; 647 PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc)); 648 if (proc != rank) { /* copy to the 
outgoing buffer */ 649 ctr[proc]++; 650 *ptr[proc] = row; 651 ptr[proc]++; 652 } else if (!PetscBTLookupSet(table_i, row)) { 653 #if defined(PETSC_USE_CTABLE) 654 PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1)); 655 #else 656 data_i[isz_i] = row; /* Update the local table */ 657 #endif 658 isz_i++; 659 } 660 } 661 /* Update the headers for the current IS */ 662 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */ 663 if ((ctr_j = ctr[j])) { 664 outdat_j = outdat[j]; 665 k = ++outdat_j[0]; 666 outdat_j[2 * k] = ctr_j; 667 outdat_j[2 * k - 1] = i; 668 } 669 } 670 isz[i] = isz_i; 671 } 672 } 673 674 /* Now post the sends */ 675 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 676 for (PetscMPIInt i = 0; i < nrqs; ++i) { 677 PetscMPIInt j = pa[i]; 678 PetscCallMPI(MPIU_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i)); 679 } 680 681 /* No longer need the original indices */ 682 PetscCall(PetscMalloc1(imax, &iscomms)); 683 for (PetscInt i = 0; i < imax; ++i) { 684 PetscCall(ISRestoreIndices(is[i], idx + i)); 685 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL)); 686 } 687 PetscCall(PetscFree2(*(PetscInt ***)&idx, n)); 688 689 for (PetscInt i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i])); 690 691 /* Do Local work */ 692 #if defined(PETSC_USE_CTABLE) 693 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data)); 694 #else 695 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL)); 696 #endif 697 698 /* Receive messages */ 699 PetscCall(PetscMalloc1(nrqr, &recv_status)); 700 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status)); 701 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 702 703 /* Phase 1 sends are complete - deallocate buffers */ 704 PetscCall(PetscFree4(outdat, ptr, tmp, ctr)); 705 PetscCall(PetscFree4(w1, w2, w3, w4)); 706 707 PetscCall(PetscMalloc1(nrqr, &xdata)); 708 PetscCall(PetscMalloc1(nrqr, &isz1)); 709 
PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1)); 710 PetscCall(PetscFree(rbuf[0])); 711 PetscCall(PetscFree(rbuf)); 712 713 /* Send the data back */ 714 /* Do a global reduction to know the buffer space req for incoming messages */ 715 { 716 PetscMPIInt *rw1; 717 718 PetscCall(PetscCalloc1(size, &rw1)); 719 for (PetscMPIInt i = 0; i < nrqr; ++i) { 720 proc = recv_status[i].MPI_SOURCE; 721 PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch"); 722 PetscCall(PetscMPIIntCast(isz1[i], &rw1[proc])); 723 } 724 PetscCall(PetscFree(onodes1)); 725 PetscCall(PetscFree(olengths1)); 726 727 /* Determine the number of messages to expect, their lengths, from from-ids */ 728 PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2)); 729 PetscCall(PetscFree(rw1)); 730 } 731 /* Now post the Irecvs corresponding to these messages */ 732 PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2)); 733 734 /* Now post the sends */ 735 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 736 for (PetscMPIInt i = 0; i < nrqr; ++i) PetscCallMPI(MPIU_Isend(xdata[i], isz1[i], MPIU_INT, recv_status[i].MPI_SOURCE, tag2, comm, s_waits2 + i)); 737 738 /* receive work done on other processors */ 739 { 740 PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax; 741 PetscMPIInt idex; 742 PetscBT table_i; 743 744 for (PetscMPIInt i = 0; i < nrqs; ++i) { 745 PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE)); 746 /* Process the message */ 747 rbuf2_i = rbuf2[idex]; 748 ct1 = 2 * rbuf2_i[0] + 1; 749 jmax = rbuf2[idex][0]; 750 for (PetscInt j = 1; j <= jmax; j++) { 751 max = rbuf2_i[2 * j]; 752 is_no = rbuf2_i[2 * j - 1]; 753 isz_i = isz[is_no]; 754 table_i = table[is_no]; 755 #if defined(PETSC_USE_CTABLE) 756 table_data_i = table_data[is_no]; 757 #else 758 data_i = data[is_no]; 759 #endif 760 for (k = 0; k < max; k++, ct1++) { 761 row = rbuf2_i[ct1]; 762 if (!PetscBTLookupSet(table_i, row)) { 
763 #if defined(PETSC_USE_CTABLE) 764 PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1)); 765 #else 766 data_i[isz_i] = row; 767 #endif 768 isz_i++; 769 } 770 } 771 isz[is_no] = isz_i; 772 } 773 } 774 775 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 776 } 777 778 #if defined(PETSC_USE_CTABLE) 779 tcount_max = 0; 780 for (PetscInt i = 0; i < imax; ++i) { 781 table_data_i = table_data[i]; 782 PetscCall(PetscHMapIGetSize(table_data_i, &tcount)); 783 if (tcount_max < tcount) tcount_max = tcount; 784 } 785 PetscCall(PetscMalloc1(tcount_max, &tdata)); 786 #endif 787 788 for (PetscInt i = 0; i < imax; ++i) { 789 #if defined(PETSC_USE_CTABLE) 790 PetscHashIter tpos; 791 PetscInt j; 792 793 table_data_i = table_data[i]; 794 PetscHashIterBegin(table_data_i, tpos); 795 while (!PetscHashIterAtEnd(table_data_i, tpos)) { 796 PetscHashIterGetKey(table_data_i, tpos, k); 797 PetscHashIterGetVal(table_data_i, tpos, j); 798 PetscHashIterNext(table_data_i, tpos); 799 tdata[--j] = --k; 800 } 801 PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i)); 802 #else 803 PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i)); 804 #endif 805 PetscCall(PetscCommDestroy(&iscomms[i])); 806 } 807 808 PetscCall(PetscFree(iscomms)); 809 PetscCall(PetscFree(onodes2)); 810 PetscCall(PetscFree(olengths2)); 811 812 PetscCall(PetscFree(pa)); 813 PetscCall(PetscFree(rbuf2[0])); 814 PetscCall(PetscFree(rbuf2)); 815 PetscCall(PetscFree(s_waits1)); 816 PetscCall(PetscFree(r_waits1)); 817 PetscCall(PetscFree(s_waits2)); 818 PetscCall(PetscFree(r_waits2)); 819 PetscCall(PetscFree(recv_status)); 820 if (xdata) { 821 PetscCall(PetscFree(xdata[0])); 822 PetscCall(PetscFree(xdata)); 823 } 824 PetscCall(PetscFree(isz1)); 825 #if defined(PETSC_USE_CTABLE) 826 for (PetscInt i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i)); 827 PetscCall(PetscFree(table_data)); 828 PetscCall(PetscFree(tdata)); 829 
PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions

   For each index set i, every row index already in the set is expanded by the
   column indices of that row in both the diagonal (A) and off-diagonal (B)
   local blocks; table[i] (a bit table over global columns) guards against
   duplicates. In the CTABLE build the set is kept in the hash map table_data[i]
   with 1-based keys/values (0 means "absent"); otherwise it lives in data[i].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI    table_data_i;
  PetscHashIter tpos;
  PetscInt      tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart; /* first global row owned by this rank */
  cstart = C->cmap->rstart; /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps local B columns to global column indices */

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[]; keys/values are stored
       1-based in the hash map, hence the decrements below */
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, row);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --row; /* tdata[position] = global row index */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i]; /* expand only the rows present before this pass */

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart; /* local row index */
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat: columns are local, shift by cstart to global */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) { /* first time this column is seen */
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat: garray gives the global column */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i; /* new (grown) size of index set i */

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of index sets requested, total_sz = total number of
     requested row indices (message format: rbuf_i[0] = #sets, then per set j
     a pair with rbuf_i[2*j] = size) */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  /* heuristic size for the single reply buffer: average nonzeros per local row
     times the number of requested sets, grown 1.5x on demand below */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0])); /* one chunk; xdata[i>0] point into it */
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0; /* running count of all entries written into xdata[0] (headers + data) */
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2 * rbuf_0 + 1; /* read cursor: first row index after the header */
    ct2    = ct1;            /* write cursor into xdata[i] */
    ct3 += ct1;              /* reserve header space in the global count */
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i */
      PetscCall(PetscBTMemzero(m, xtable)); /* duplicates tracked per IS */
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single chunk 1.5x, then rebase the per-message pointers */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand every accepted row by its column indices (overlap increase) */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) { /* diagonal block: shift to global columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) { /* off-diagonal block: garray gives global columns */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i] = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
/*
    Every processor gets the entire matrix: gather the parallel matrix A into a
    sequential matrix B on each rank via in-place MPI_Allreduce sums over
    zeroed index/value buffers.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat          B;
  Mat_MPIAIJ  *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ  *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt  size, rank;
  PetscInt     sendcount, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
  PetscInt     m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* Tell every processor the number of nonzeros per row */
    PetscCall(PetscCalloc1(A->rmap->N, &lens));
    for (PetscInt i = A->rmap->rstart; i < A->rmap->rend; i++)
      lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];

    /* All MPI processes get the same matrix */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Create the sequential matrix of the same type as the local block diagonal */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    PetscCall(PetscCalloc1(2, Bin)); /* Bin is a NULL-terminated array of one matrix */
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /* zero column space: nrecv = total number of nonzeros in the global matrix */
    nrecv = 0;
    for (PetscMPIInt i = 0; i < size; i++) {
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
    }
    PetscCall(PetscArrayzero(b->j, nrecv));

    /* Copy my part of matrix column indices over, directly into the slot of b->j
       that corresponds to my row range; the Allreduce below sums the zeros
       contributed by all other ranks */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (PetscInt i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* send column indices, b->j was zeroed */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Assemble the matrix into usable form (numerical values not yet set) */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (PetscInt i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else { /* MAT_REUSE_MATRIX: structure already gathered, reuse it */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /* Copy my part of matrix numerical values into the values location */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf;

    /* initialize b->a */
    PetscCall(PetscArrayzero(b->a, b->nz));

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    /* same lower/diagonal/upper interleaving as for the column indices above */
    sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (PetscInt i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (PetscInt j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* send values, b->a was zeroed */
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A)));

    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* end of (flag == MAT_GET_VALUES) */

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ *)C->data;
  Mat             submat, A = c->A, B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
  PetscInt       *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
  PetscInt        cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
  const PetscInt *icol, *irow;
  PetscInt        nrow, ncol, start;
  PetscMPIInt     rank, size, *req_source1, *req_source2, tag1, tag2, tag3, tag4, *w1, *w2, nrqr, nrqs = 0, proc, *pa;
  PetscInt      **sbuf1, **sbuf2, k, ct1, ct2, ct3, **rbuf1, row;
  PetscInt        msz, **ptr, *req_size, *ctr, *tmp, tcol, *iptr;
  PetscInt      **rbuf3, **sbuf_aj, **rbuf2, max1, nnz;
  PetscInt       *lens, rmax, ncols, *cols, Crow;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI cmap, rmap;
  PetscInt  *cmap_loc, *rmap_loc;
#else
  PetscInt *cmap, *rmap;
#endif
  PetscInt     ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i, jcnt;
  PetscInt    *cworkB, lwrite, *subcols, ib, jb;
  PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
  MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
  MPI_Status
*r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1239 MPI_Status *r_status3 = NULL, *r_status4, *s_status4; 1240 MPI_Comm comm; 1241 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1242 PetscMPIInt *onodes1, *olengths1, idex, end, *row2proc; 1243 Mat_SubSppt *smatis1; 1244 PetscBool isrowsorted, iscolsorted; 1245 1246 PetscFunctionBegin; 1247 PetscValidLogicalCollectiveInt(C, ismax, 2); 1248 PetscValidLogicalCollectiveEnum(C, scall, 5); 1249 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1250 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1251 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1252 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1253 size = c->size; 1254 rank = c->rank; 1255 1256 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1257 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1258 PetscCall(ISGetIndices(isrow[0], &irow)); 1259 PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1260 if (allcolumns) { 1261 icol = NULL; 1262 ncol = C->cmap->N; 1263 } else { 1264 PetscCall(ISGetIndices(iscol[0], &icol)); 1265 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1266 } 1267 1268 if (scall == MAT_INITIAL_MATRIX) { 1269 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1270 1271 /* Get some new tags to keep the communication clean */ 1272 tag1 = ((PetscObject)C)->tag; 1273 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1274 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1275 1276 /* evaluate communication - mesg to who, length of mesg, and buffer space 1277 required. 
Based on this, buffers are allocated, and data copied into them */ 1278 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1279 PetscCall(PetscMalloc1(nrow, &row2proc)); 1280 1281 /* w1[proc] = num of rows owned by proc -- to be requested */ 1282 proc = 0; 1283 nrqs = 0; /* num of outgoing messages */ 1284 for (PetscInt j = 0; j < nrow; j++) { 1285 row = irow[j]; 1286 if (!isrowsorted) proc = 0; 1287 while (row >= C->rmap->range[proc + 1]) proc++; 1288 w1[proc]++; 1289 row2proc[j] = proc; /* map row index to proc */ 1290 1291 if (proc != rank && !w2[proc]) { 1292 w2[proc] = 1; 1293 nrqs++; 1294 } 1295 } 1296 w1[rank] = 0; /* rows owned by self will not be requested */ 1297 1298 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1299 for (PetscMPIInt proc = 0, j = 0; proc < size; proc++) { 1300 if (w1[proc]) pa[j++] = proc; 1301 } 1302 1303 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1304 msz = 0; /* total mesg length (for all procs) */ 1305 for (PetscMPIInt i = 0; i < nrqs; i++) { 1306 w1[pa[i]] += 3; 1307 msz += w1[pa[i]]; 1308 } 1309 PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1310 1311 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1312 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1313 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1314 1315 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1316 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1317 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1318 1319 /* Now post the Irecvs corresponding to these messages */ 1320 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1321 1322 PetscCall(PetscFree(onodes1)); 1323 PetscCall(PetscFree(olengths1)); 1324 1325 /* Allocate Memory for outgoing messages */ 1326 
PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1327 PetscCall(PetscArrayzero(sbuf1, size)); 1328 PetscCall(PetscArrayzero(ptr, size)); 1329 1330 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1331 iptr = tmp; 1332 for (PetscMPIInt i = 0; i < nrqs; i++) { 1333 sbuf1[pa[i]] = iptr; 1334 iptr += w1[pa[i]]; 1335 } 1336 1337 /* Form the outgoing messages */ 1338 /* Initialize the header space */ 1339 for (PetscMPIInt i = 0; i < nrqs; i++) { 1340 PetscCall(PetscArrayzero(sbuf1[pa[i]], 3)); 1341 ptr[pa[i]] = sbuf1[pa[i]] + 3; 1342 } 1343 1344 /* Parse the isrow and copy data into outbuf */ 1345 PetscCall(PetscArrayzero(ctr, size)); 1346 for (PetscInt j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1347 if (row2proc[j] != rank) { /* copy to the outgoing buf */ 1348 *ptr[row2proc[j]] = irow[j]; 1349 ctr[row2proc[j]]++; 1350 ptr[row2proc[j]]++; 1351 } 1352 } 1353 1354 /* Update the headers for the current IS */ 1355 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */ 1356 if ((ctr_j = ctr[j])) { 1357 sbuf1_j = sbuf1[j]; 1358 k = ++sbuf1_j[0]; 1359 sbuf1_j[2 * k] = ctr_j; 1360 sbuf1_j[2 * k - 1] = 0; 1361 } 1362 } 1363 1364 /* Now post the sends */ 1365 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1366 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Isend(sbuf1[pa[i]], w1[pa[i]], MPIU_INT, pa[i], tag1, comm, s_waits1 + i)); 1367 1368 /* Post Receives to capture the buffer size */ 1369 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1370 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1371 1372 if (nrqs) rbuf2[0] = tmp + msz; 1373 for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1374 1375 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[pa[i]], MPIU_INT, pa[i], tag2, comm, r_waits2 + i)); 1376 1377 PetscCall(PetscFree2(w1, w2)); 1378 1379 /* Send to other 
procs the buf size they should allocate */ 1380 /* Receive messages*/ 1381 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1382 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1383 1384 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1385 for (PetscMPIInt i = 0; i < nrqr; ++i) { 1386 req_size[i] = 0; 1387 rbuf1_i = rbuf1[i]; 1388 start = 2 * rbuf1_i[0] + 1; 1389 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1390 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1391 sbuf2_i = sbuf2[i]; 1392 for (PetscInt j = start; j < end; j++) { 1393 k = rbuf1_i[j] - rstart; 1394 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1395 sbuf2_i[j] = ncols; 1396 req_size[i] += ncols; 1397 } 1398 req_source1[i] = r_status1[i].MPI_SOURCE; 1399 1400 /* form the header */ 1401 sbuf2_i[0] = req_size[i]; 1402 for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1403 1404 PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1405 } 1406 1407 PetscCall(PetscFree(r_status1)); 1408 PetscCall(PetscFree(r_waits1)); 1409 1410 /* rbuf2 is received, Post recv column indices a->j */ 1411 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1412 1413 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1414 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1415 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 1416 req_source2[i] = r_status2[i].MPI_SOURCE; 1417 PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1418 } 1419 1420 /* Wait on sends1 and sends2 */ 1421 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1422 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1423 PetscCall(PetscFree(s_waits1)); 1424 PetscCall(PetscFree(s_status1)); 1425 1426 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1427 PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2)); 1428 1429 /* Now allocate sending buffers for a->j, and send them off */ 
1430 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 1431 jcnt = 0; 1432 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 1433 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0])); 1434 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1435 1436 for (PetscMPIInt i = 0; i < nrqr; i++) { /* for each requested message */ 1437 rbuf1_i = rbuf1[i]; 1438 sbuf_aj_i = sbuf_aj[i]; 1439 ct1 = 2 * rbuf1_i[0] + 1; 1440 ct2 = 0; 1441 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1442 1443 kmax = rbuf1[i][2]; 1444 for (PetscInt k = 0; k < kmax; k++, ct1++) { /* for each row */ 1445 row = rbuf1_i[ct1] - rstart; 1446 nzA = ai[row + 1] - ai[row]; 1447 nzB = bi[row + 1] - bi[row]; 1448 ncols = nzA + nzB; 1449 cworkA = PetscSafePointerPlusOffset(aj, ai[row]); 1450 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1451 1452 /* load the column indices for this row into cols*/ 1453 cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2); 1454 1455 lwrite = 0; 1456 for (PetscInt l = 0; l < nzB; l++) { 1457 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1458 } 1459 for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1460 for (PetscInt l = 0; l < nzB; l++) { 1461 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1462 } 1463 1464 ct2 += ncols; 1465 } 1466 PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1467 } 1468 1469 /* create column map (cmap): global col of C -> local col of submat */ 1470 #if defined(PETSC_USE_CTABLE) 1471 if (!allcolumns) { 1472 PetscCall(PetscHMapICreateWithSize(ncol, &cmap)); 1473 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1474 for (PetscInt j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1475 if (icol[j] >= cstart && icol[j] < cend) { 1476 cmap_loc[icol[j] - cstart] = j + 1; 1477 } else { /* use PetscHMapI for non-local col indices */ 1478 
PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1)); 1479 } 1480 } 1481 } else { 1482 cmap = NULL; 1483 cmap_loc = NULL; 1484 } 1485 PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1486 #else 1487 if (!allcolumns) { 1488 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1489 for (PetscInt j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1490 } else { 1491 cmap = NULL; 1492 } 1493 #endif 1494 1495 /* Create lens for MatSeqAIJSetPreallocation() */ 1496 PetscCall(PetscCalloc1(nrow, &lens)); 1497 1498 /* Compute lens from local part of C */ 1499 for (PetscInt j = 0; j < nrow; j++) { 1500 row = irow[j]; 1501 if (row2proc[j] == rank) { 1502 /* diagonal part A = c->A */ 1503 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1504 cols = PetscSafePointerPlusOffset(aj, ai[row - rstart]); 1505 if (!allcolumns) { 1506 for (PetscInt k = 0; k < ncols; k++) { 1507 #if defined(PETSC_USE_CTABLE) 1508 tcol = cmap_loc[cols[k]]; 1509 #else 1510 tcol = cmap[cols[k] + cstart]; 1511 #endif 1512 if (tcol) lens[j]++; 1513 } 1514 } else { /* allcolumns */ 1515 lens[j] = ncols; 1516 } 1517 1518 /* off-diagonal part B = c->B */ 1519 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1520 cols = PetscSafePointerPlusOffset(bj, bi[row - rstart]); 1521 if (!allcolumns) { 1522 for (PetscInt k = 0; k < ncols; k++) { 1523 #if defined(PETSC_USE_CTABLE) 1524 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1525 #else 1526 tcol = cmap[bmap[cols[k]]]; 1527 #endif 1528 if (tcol) lens[j]++; 1529 } 1530 } else { /* allcolumns */ 1531 lens[j] += ncols; 1532 } 1533 } 1534 } 1535 1536 /* Create row map (rmap): global row of C -> local row of submat */ 1537 #if defined(PETSC_USE_CTABLE) 1538 PetscCall(PetscHMapICreateWithSize(nrow, &rmap)); 1539 for (PetscInt j = 0; j < nrow; j++) { 1540 row = irow[j]; 1541 if (row2proc[j] == rank) { /* a local row */ 1542 rmap_loc[row - rstart] = j; 1543 } else { 1544 PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1)); 1545 } 1546 } 1547 #else 1548 
PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1549 for (PetscInt j = 0; j < nrow; j++) rmap[irow[j]] = j; 1550 #endif 1551 1552 /* Update lens from offproc data */ 1553 /* recv a->j is done */ 1554 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1555 for (PetscMPIInt i = 0; i < nrqs; i++) { 1556 sbuf1_i = sbuf1[pa[i]]; 1557 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1558 ct1 = 2 + 1; 1559 ct2 = 0; 1560 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1561 rbuf3_i = rbuf3[i]; /* received C->j */ 1562 1563 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1564 max1 = sbuf1_i[2]; 1565 for (PetscInt k = 0; k < max1; k++, ct1++) { 1566 #if defined(PETSC_USE_CTABLE) 1567 PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row)); 1568 row--; 1569 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1570 #else 1571 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1572 #endif 1573 /* Now, store row index of submat in sbuf1_i[ct1] */ 1574 sbuf1_i[ct1] = row; 1575 1576 nnz = rbuf2_i[ct1]; 1577 if (!allcolumns) { 1578 for (PetscMPIInt l = 0; l < nnz; l++, ct2++) { 1579 #if defined(PETSC_USE_CTABLE) 1580 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1581 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1582 } else { 1583 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1584 } 1585 #else 1586 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1587 #endif 1588 if (tcol) lens[row]++; 1589 } 1590 } else { /* allcolumns */ 1591 lens[row] += nnz; 1592 } 1593 } 1594 } 1595 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1596 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1597 1598 /* Create the submatrices */ 1599 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1600 PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1601 1602 PetscCall(ISGetBlockSize(isrow[0], &ib)); 
1603 PetscCall(ISGetBlockSize(iscol[0], &jb)); 1604 if (ib > 1 || jb > 1) PetscCall(MatSetBlockSizes(submat, ib, jb)); 1605 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1606 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1607 1608 /* create struct Mat_SubSppt and attached it to submat */ 1609 PetscCall(PetscNew(&smatis1)); 1610 subc = (Mat_SeqAIJ *)submat->data; 1611 subc->submatis1 = smatis1; 1612 1613 smatis1->id = 0; 1614 smatis1->nrqs = nrqs; 1615 smatis1->nrqr = nrqr; 1616 smatis1->rbuf1 = rbuf1; 1617 smatis1->rbuf2 = rbuf2; 1618 smatis1->rbuf3 = rbuf3; 1619 smatis1->sbuf2 = sbuf2; 1620 smatis1->req_source2 = req_source2; 1621 1622 smatis1->sbuf1 = sbuf1; 1623 smatis1->ptr = ptr; 1624 smatis1->tmp = tmp; 1625 smatis1->ctr = ctr; 1626 1627 smatis1->pa = pa; 1628 smatis1->req_size = req_size; 1629 smatis1->req_source1 = req_source1; 1630 1631 smatis1->allcolumns = allcolumns; 1632 smatis1->singleis = PETSC_TRUE; 1633 smatis1->row2proc = row2proc; 1634 smatis1->rmap = rmap; 1635 smatis1->cmap = cmap; 1636 #if defined(PETSC_USE_CTABLE) 1637 smatis1->rmap_loc = rmap_loc; 1638 smatis1->cmap_loc = cmap_loc; 1639 #endif 1640 1641 smatis1->destroy = submat->ops->destroy; 1642 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1643 submat->factortype = C->factortype; 1644 1645 /* compute rmax */ 1646 rmax = 0; 1647 for (PetscMPIInt i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1648 1649 } else { /* scall == MAT_REUSE_MATRIX */ 1650 submat = submats[0]; 1651 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1652 1653 subc = (Mat_SeqAIJ *)submat->data; 1654 rmax = subc->rmax; 1655 smatis1 = subc->submatis1; 1656 nrqs = smatis1->nrqs; 1657 nrqr = smatis1->nrqr; 1658 rbuf1 = smatis1->rbuf1; 1659 rbuf2 = smatis1->rbuf2; 1660 rbuf3 = smatis1->rbuf3; 1661 req_source2 = smatis1->req_source2; 1662 1663 sbuf1 = smatis1->sbuf1; 1664 sbuf2 = smatis1->sbuf2; 1665 ptr = smatis1->ptr; 1666 tmp = smatis1->tmp; 1667 ctr = smatis1->ctr; 1668 1669 pa = smatis1->pa; 1670 req_size = smatis1->req_size; 1671 req_source1 = smatis1->req_source1; 1672 1673 allcolumns = smatis1->allcolumns; 1674 row2proc = smatis1->row2proc; 1675 rmap = smatis1->rmap; 1676 cmap = smatis1->cmap; 1677 #if defined(PETSC_USE_CTABLE) 1678 rmap_loc = smatis1->rmap_loc; 1679 cmap_loc = smatis1->cmap_loc; 1680 #endif 1681 } 1682 1683 /* Post recv matrix values */ 1684 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1685 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1686 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1687 for (PetscMPIInt i = 0; i < nrqs; ++i) { 1688 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1689 PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1690 } 1691 1692 /* Allocate sending buffers for a->a, and send them off */ 1693 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1694 jcnt = 0; 1695 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 1696 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0])); 1697 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1698 1699 for (PetscMPIInt i = 0; i < nrqr; i++) { 1700 rbuf1_i = rbuf1[i]; 1701 sbuf_aa_i = sbuf_aa[i]; 1702 ct1 = 2 * rbuf1_i[0] + 1; 1703 ct2 = 0; 1704 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1705 1706 kmax = rbuf1_i[2]; 1707 for (PetscInt k = 0; k < kmax; k++, ct1++) { 1708 row = rbuf1_i[ct1] - rstart; 1709 nzA 
= ai[row + 1] - ai[row]; 1710 nzB = bi[row + 1] - bi[row]; 1711 ncols = nzA + nzB; 1712 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1713 vworkA = PetscSafePointerPlusOffset(a_a, ai[row]); 1714 vworkB = PetscSafePointerPlusOffset(b_a, bi[row]); 1715 1716 /* load the column values for this row into vals*/ 1717 vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2); 1718 1719 lwrite = 0; 1720 for (PetscInt l = 0; l < nzB; l++) { 1721 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1722 } 1723 for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1724 for (PetscInt l = 0; l < nzB; l++) { 1725 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1726 } 1727 1728 ct2 += ncols; 1729 } 1730 PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1731 } 1732 1733 /* Assemble submat */ 1734 /* First assemble the local rows */ 1735 for (PetscInt j = 0; j < nrow; j++) { 1736 row = irow[j]; 1737 if (row2proc[j] == rank) { 1738 Crow = row - rstart; /* local row index of C */ 1739 #if defined(PETSC_USE_CTABLE) 1740 row = rmap_loc[Crow]; /* row index of submat */ 1741 #else 1742 row = rmap[row]; 1743 #endif 1744 1745 if (allcolumns) { 1746 PetscInt ncol = 0; 1747 1748 /* diagonal part A = c->A */ 1749 ncols = ai[Crow + 1] - ai[Crow]; 1750 cols = PetscSafePointerPlusOffset(aj, ai[Crow]); 1751 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1752 for (PetscInt k = 0; k < ncols; k++) { 1753 subcols[ncol] = cols[k] + cstart; 1754 subvals[ncol++] = vals[k]; 1755 } 1756 1757 /* off-diagonal part B = c->B */ 1758 ncols = bi[Crow + 1] - bi[Crow]; 1759 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1760 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1761 for (PetscInt k = 0; k < ncols; k++) { 1762 subcols[ncol] = bmap[cols[k]]; 1763 subvals[ncol++] = vals[k]; 1764 } 1765 1766 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES)); 1767 1768 } else { /* !allcolumns */ 1769 
PetscInt ncol = 0; 1770 1771 #if defined(PETSC_USE_CTABLE) 1772 /* diagonal part A = c->A */ 1773 ncols = ai[Crow + 1] - ai[Crow]; 1774 cols = PetscSafePointerPlusOffset(aj, ai[Crow]); 1775 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1776 for (PetscInt k = 0; k < ncols; k++) { 1777 tcol = cmap_loc[cols[k]]; 1778 if (tcol) { 1779 subcols[ncol] = --tcol; 1780 subvals[ncol++] = vals[k]; 1781 } 1782 } 1783 1784 /* off-diagonal part B = c->B */ 1785 ncols = bi[Crow + 1] - bi[Crow]; 1786 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1787 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1788 for (PetscInt k = 0; k < ncols; k++) { 1789 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1790 if (tcol) { 1791 subcols[ncol] = --tcol; 1792 subvals[ncol++] = vals[k]; 1793 } 1794 } 1795 #else 1796 /* diagonal part A = c->A */ 1797 ncols = ai[Crow + 1] - ai[Crow]; 1798 cols = aj + ai[Crow]; 1799 vals = a_a + ai[Crow]; 1800 for (PetscInt k = 0; k < ncols; k++) { 1801 tcol = cmap[cols[k] + cstart]; 1802 if (tcol) { 1803 subcols[ncol] = --tcol; 1804 subvals[ncol++] = vals[k]; 1805 } 1806 } 1807 1808 /* off-diagonal part B = c->B */ 1809 ncols = bi[Crow + 1] - bi[Crow]; 1810 cols = bj + bi[Crow]; 1811 vals = b_a + bi[Crow]; 1812 for (PetscInt k = 0; k < ncols; k++) { 1813 tcol = cmap[bmap[cols[k]]]; 1814 if (tcol) { 1815 subcols[ncol] = --tcol; 1816 subvals[ncol++] = vals[k]; 1817 } 1818 } 1819 #endif 1820 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, ncol, subcols, subvals, INSERT_VALUES)); 1821 } 1822 } 1823 } 1824 1825 /* Now assemble the off-proc rows */ 1826 for (PetscMPIInt i = 0; i < nrqs; i++) { /* for each requested message */ 1827 /* recv values from other processes */ 1828 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1829 sbuf1_i = sbuf1[pa[idex]]; 1830 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1831 ct1 = 2 + 1; 1832 ct2 = 0; /* count of received C->j */ 1833 ct3 = 
0; /* count of received C->j that will be inserted into submat */ 1834 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1835 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1836 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1837 1838 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1839 max1 = sbuf1_i[2]; /* num of rows */ 1840 for (PetscInt k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1841 row = sbuf1_i[ct1]; /* row index of submat */ 1842 if (!allcolumns) { 1843 idex = 0; 1844 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1845 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1846 for (PetscInt l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1847 #if defined(PETSC_USE_CTABLE) 1848 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1849 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1850 } else { 1851 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1852 } 1853 #else 1854 tcol = cmap[rbuf3_i[ct2]]; 1855 #endif 1856 if (tcol) { 1857 subcols[idex] = --tcol; /* may not be sorted */ 1858 subvals[idex++] = rbuf4_i[ct2]; 1859 1860 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
1861 For reuse, we replace received C->j with index that should be inserted to submat */ 1862 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1863 } 1864 } 1865 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1866 } else { /* scall == MAT_REUSE_MATRIX */ 1867 submat = submats[0]; 1868 subc = (Mat_SeqAIJ *)submat->data; 1869 1870 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1871 for (PetscInt l = 0; l < nnz; l++) { 1872 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1873 subvals[idex++] = rbuf4_i[ct2]; 1874 } 1875 1876 bj = subc->j + subc->i[row]; /* sorted column indices */ 1877 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1878 } 1879 } else { /* allcolumns */ 1880 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1881 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES)); 1882 ct2 += nnz; 1883 } 1884 } 1885 } 1886 1887 /* sending a->a are done */ 1888 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1889 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1890 1891 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1892 PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY)); 1893 submats[0] = submat; 1894 1895 /* Restore the indices */ 1896 PetscCall(ISRestoreIndices(isrow[0], &irow)); 1897 if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol)); 1898 1899 /* Destroy allocated memory */ 1900 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 1901 PetscCall(PetscFree3(rbuf4, subcols, subvals)); 1902 if (sbuf_aa) { 1903 PetscCall(PetscFree(sbuf_aa[0])); 1904 PetscCall(PetscFree(sbuf_aa)); 1905 } 1906 1907 if (scall == MAT_INITIAL_MATRIX) { 1908 PetscCall(PetscFree(lens)); 1909 if (sbuf_aj) { 1910 PetscCall(PetscFree(sbuf_aj[0])); 1911 PetscCall(PetscFree(sbuf_aj)); 1912 } 1913 } 
/* Release read access to the value arrays of A and B obtained earlier in this routine */
  PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Driver for the special case in which each process contributes exactly one index set.

  - On MAT_INITIAL_MATRIX, allocates the submat array with 2 slots (the second slot is
    presumably reserved for a dummy/bookkeeping matrix -- see the generic driver below;
    TODO confirm against MatCreateSubMatrices_MPIAIJ_SingleIS_Local).
  - Detects the "allcolumns" fast path: iscol[0] is an identity IS covering all global
    columns of C, so no column map needs to be built.
*/
static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt  ncol;
  PetscBool colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol[0], &colflag));
  PetscCall(ISGetLocalSize(iscol[0], &ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Extract ismax sequential submatrices of the parallel matrix C, given by the index sets
  isrow[]/iscol[], in one or more stages.

  Collective driver:
  - If every process requests a single IS (C->submat_singleis, set in PCSetUp_ASM()),
    dispatch to the SingleIS path, which avoids the MPI_Allreduce() below.
  - If every process requests the entire (square, identity-permuted) matrix, dispatch to
    MatCreateSubMatrix_MPIAIJ_All() (can be disabled with -use_fast_submatrix).
  - Otherwise extract at most nmax submatrices per stage through
    MatCreateSubMatrices_MPIAIJ_Local(); nmax is chosen so one stage's column-map work
    space stays around 20M PetscInts (so e.g. a matrix with 20M columns allows only one
    submatrix per stage).

  On MAT_REUSE_MATRIX the staging count and communication metadata are recovered from the
  Mat_SubSppt struct stashed in submat[0] during the initial call.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extractable per stage within ~20M PetscInts of work space */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        /* this rank wants the whole matrix; the fast path can be toggled by the option */
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* One MPI_MAX reduction combines both quantities:
       in[0] is -1 when this rank wants the whole matrix and 0 otherwise, so
       max(in[0]) == -1 only if ALL ranks want it (i.e. a logical AND);
       max(in[1]) is the largest local stage count. */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCallMPI(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  for (PetscInt i = 0, pos = 0; i < nstages; i++) {
    /* max_no = number of submatrices extracted this stage; 0 once this rank's
       requests are exhausted (it must still participate in the collective call) */
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages; /* stash nstages in the dummy so MAT_REUSE_MATRIX can recover it */
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats)
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  Mat              A = c->A;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
  const PetscInt **icol, **irow;
  PetscInt        *nrow, *ncol, start;
  PetscMPIInt      nrqs = 0, *pa, proc = -1;
  PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr, *req_source1 = NULL, *req_source2;
  PetscInt
**sbuf1, **sbuf2, k, ct1, ct2, **rbuf1, row; 2044 PetscInt msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *tmp = NULL, tcol; 2045 PetscInt **rbuf3 = NULL, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2046 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2047 #if defined(PETSC_USE_CTABLE) 2048 PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i; 2049 #else 2050 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2051 #endif 2052 const PetscInt *irow_i; 2053 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2054 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2055 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2056 MPI_Comm comm; 2057 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2058 PetscMPIInt *onodes1, *olengths1, end, **row2proc, *row2proc_i; 2059 PetscInt ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2060 Mat_SubSppt *smat_i; 2061 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2062 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen, jcnt; 2063 2064 PetscFunctionBegin; 2065 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2066 size = c->size; 2067 rank = c->rank; 2068 2069 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2070 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2071 2072 for (PetscInt i = 0; i < ismax; i++) { 2073 PetscCall(ISSorted(iscol[i], &issorted[i])); 2074 if (!issorted[i]) iscsorted = issorted[i]; 2075 2076 PetscCall(ISSorted(isrow[i], &issorted[i])); 2077 2078 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2079 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2080 2081 /* Check for special case: allcolumn */ 2082 PetscCall(ISIdentity(iscol[i], &colflag)); 2083 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2084 if (colflag && ncol[i] == C->cmap->N) { 2085 allcolumns[i] = PETSC_TRUE; 2086 icol[i] = NULL; 2087 } else { 2088 allcolumns[i] 
= PETSC_FALSE; 2089 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2090 } 2091 } 2092 2093 if (scall == MAT_REUSE_MATRIX) { 2094 /* Assumes new rows are same length as the old rows */ 2095 for (PetscInt i = 0; i < ismax; i++) { 2096 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2097 subc = (Mat_SeqAIJ *)submats[i]->data; 2098 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size"); 2099 2100 /* Initial matrix as if empty */ 2101 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2102 2103 smat_i = subc->submatis1; 2104 2105 nrqs = smat_i->nrqs; 2106 nrqr = smat_i->nrqr; 2107 rbuf1 = smat_i->rbuf1; 2108 rbuf2 = smat_i->rbuf2; 2109 rbuf3 = smat_i->rbuf3; 2110 req_source2 = smat_i->req_source2; 2111 2112 sbuf1 = smat_i->sbuf1; 2113 sbuf2 = smat_i->sbuf2; 2114 ptr = smat_i->ptr; 2115 tmp = smat_i->tmp; 2116 ctr = smat_i->ctr; 2117 2118 pa = smat_i->pa; 2119 req_size = smat_i->req_size; 2120 req_source1 = smat_i->req_source1; 2121 2122 allcolumns[i] = smat_i->allcolumns; 2123 row2proc[i] = smat_i->row2proc; 2124 rmap[i] = smat_i->rmap; 2125 cmap[i] = smat_i->cmap; 2126 } 2127 2128 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2129 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2130 smat_i = (Mat_SubSppt *)submats[0]->data; 2131 2132 nrqs = smat_i->nrqs; 2133 nrqr = smat_i->nrqr; 2134 rbuf1 = smat_i->rbuf1; 2135 rbuf2 = smat_i->rbuf2; 2136 rbuf3 = smat_i->rbuf3; 2137 req_source2 = smat_i->req_source2; 2138 2139 sbuf1 = smat_i->sbuf1; 2140 sbuf2 = smat_i->sbuf2; 2141 ptr = smat_i->ptr; 2142 tmp = smat_i->tmp; 2143 ctr = smat_i->ctr; 2144 2145 pa = smat_i->pa; 2146 req_size = smat_i->req_size; 2147 req_source1 = smat_i->req_source1; 2148 2149 allcolumns[0] = PETSC_FALSE; 2150 } 2151 } else { /* scall == MAT_INITIAL_MATRIX */ 
2152 /* Get some new tags to keep the communication clean */ 2153 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2154 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2155 2156 /* evaluate communication - mesg to who, length of mesg, and buffer space 2157 required. Based on this, buffers are allocated, and data copied into them*/ 2158 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2159 2160 for (PetscInt i = 0; i < ismax; i++) { 2161 jmax = nrow[i]; 2162 irow_i = irow[i]; 2163 2164 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2165 row2proc[i] = row2proc_i; 2166 2167 if (issorted[i]) proc = 0; 2168 for (PetscInt j = 0; j < jmax; j++) { 2169 if (!issorted[i]) proc = 0; 2170 row = irow_i[j]; 2171 while (row >= C->rmap->range[proc + 1]) proc++; 2172 w4[proc]++; 2173 row2proc_i[j] = proc; /* map row index to proc */ 2174 } 2175 for (PetscMPIInt j = 0; j < size; j++) { 2176 if (w4[j]) { 2177 w1[j] += w4[j]; 2178 w3[j]++; 2179 w4[j] = 0; 2180 } 2181 } 2182 } 2183 2184 nrqs = 0; /* no of outgoing messages */ 2185 msz = 0; /* total mesg length (for all procs) */ 2186 w1[rank] = 0; /* no mesg sent to self */ 2187 w3[rank] = 0; 2188 for (PetscMPIInt i = 0; i < size; i++) { 2189 if (w1[i]) { 2190 w2[i] = 1; 2191 nrqs++; 2192 } /* there exists a message to proc i */ 2193 } 2194 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2195 for (PetscMPIInt i = 0, j = 0; i < size; i++) { 2196 if (w1[i]) { 2197 pa[j] = i; 2198 j++; 2199 } 2200 } 2201 2202 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2203 for (PetscMPIInt i = 0; i < nrqs; i++) { 2204 PetscMPIInt j = pa[i]; 2205 w1[j] += w2[j] + 2 * w3[j]; 2206 msz += w1[j]; 2207 } 2208 PetscCall(PetscInfo(0, "Number of outgoing messages %d Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2209 2210 /* Determine the number of messages to expect, their lengths, from from-ids */ 2211 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, 
&nrqr)); 2212 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2213 2214 /* Now post the Irecvs corresponding to these messages */ 2215 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2216 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2217 2218 /* Allocate Memory for outgoing messages */ 2219 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2220 PetscCall(PetscArrayzero(sbuf1, size)); 2221 PetscCall(PetscArrayzero(ptr, size)); 2222 2223 { 2224 PetscInt *iptr = tmp; 2225 k = 0; 2226 for (PetscMPIInt i = 0; i < nrqs; i++) { 2227 PetscMPIInt j = pa[i]; 2228 iptr += k; 2229 sbuf1[j] = iptr; 2230 k = w1[j]; 2231 } 2232 } 2233 2234 /* Form the outgoing messages. Initialize the header space */ 2235 for (PetscMPIInt i = 0; i < nrqs; i++) { 2236 PetscMPIInt j = pa[i]; 2237 sbuf1[j][0] = 0; 2238 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2239 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2240 } 2241 2242 /* Parse the isrow and copy data into outbuf */ 2243 for (PetscInt i = 0; i < ismax; i++) { 2244 row2proc_i = row2proc[i]; 2245 PetscCall(PetscArrayzero(ctr, size)); 2246 irow_i = irow[i]; 2247 jmax = nrow[i]; 2248 for (PetscInt j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2249 proc = row2proc_i[j]; 2250 if (proc != rank) { /* copy to the outgoing buf */ 2251 ctr[proc]++; 2252 *ptr[proc] = irow_i[j]; 2253 ptr[proc]++; 2254 } 2255 } 2256 /* Update the headers for the current IS */ 2257 for (PetscMPIInt j = 0; j < size; j++) { /* Can Optimise this loop too */ 2258 if ((ctr_j = ctr[j])) { 2259 sbuf1_j = sbuf1[j]; 2260 k = ++sbuf1_j[0]; 2261 sbuf1_j[2 * k] = ctr_j; 2262 sbuf1_j[2 * k - 1] = i; 2263 } 2264 } 2265 } 2266 2267 /* Now post the sends */ 2268 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2269 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2270 PetscMPIInt j = pa[i]; 2271 PetscCallMPI(MPIU_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2272 
} 2273 2274 /* Post Receives to capture the buffer size */ 2275 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2276 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2277 if (nrqs) rbuf2[0] = tmp + msz; 2278 for (PetscMPIInt i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2279 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2280 PetscMPIInt j = pa[i]; 2281 PetscCallMPI(MPIU_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2282 } 2283 2284 /* Send to other procs the buf size they should allocate */ 2285 /* Receive messages*/ 2286 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2287 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2288 { 2289 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2290 PetscInt *sbuf2_i; 2291 2292 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2293 for (PetscMPIInt i = 0; i < nrqr; ++i) { 2294 req_size[i] = 0; 2295 rbuf1_i = rbuf1[i]; 2296 start = 2 * rbuf1_i[0] + 1; 2297 end = olengths1[i]; 2298 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2299 sbuf2_i = sbuf2[i]; 2300 for (PetscInt j = start; j < end; j++) { 2301 id = rbuf1_i[j] - rstart; 2302 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2303 sbuf2_i[j] = ncols; 2304 req_size[i] += ncols; 2305 } 2306 req_source1[i] = onodes1[i]; 2307 /* form the header */ 2308 sbuf2_i[0] = req_size[i]; 2309 for (PetscInt j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2310 2311 PetscCallMPI(MPIU_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2312 } 2313 } 2314 2315 PetscCall(PetscFree(onodes1)); 2316 PetscCall(PetscFree(olengths1)); 2317 PetscCall(PetscFree(r_waits1)); 2318 PetscCall(PetscFree4(w1, w2, w3, w4)); 2319 2320 /* Receive messages*/ 2321 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2322 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2323 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2324 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2325 req_source2[i] = 
pa[i]; 2326 PetscCallMPI(MPIU_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2327 } 2328 PetscCall(PetscFree(r_waits2)); 2329 2330 /* Wait on sends1 and sends2 */ 2331 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2332 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2333 PetscCall(PetscFree(s_waits1)); 2334 PetscCall(PetscFree(s_waits2)); 2335 2336 /* Now allocate sending buffers for a->j, and send them off */ 2337 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 2338 jcnt = 0; 2339 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 2340 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aj[0])); 2341 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2342 2343 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2344 { 2345 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2346 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2347 PetscInt cend = C->cmap->rend; 2348 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2349 2350 for (PetscMPIInt i = 0; i < nrqr; i++) { 2351 rbuf1_i = rbuf1[i]; 2352 sbuf_aj_i = sbuf_aj[i]; 2353 ct1 = 2 * rbuf1_i[0] + 1; 2354 ct2 = 0; 2355 for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2356 kmax = rbuf1[i][2 * j]; 2357 for (PetscInt k = 0; k < kmax; k++, ct1++) { 2358 row = rbuf1_i[ct1] - rstart; 2359 nzA = a_i[row + 1] - a_i[row]; 2360 nzB = b_i[row + 1] - b_i[row]; 2361 ncols = nzA + nzB; 2362 cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]); 2363 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2364 2365 /* load the column indices for this row into cols */ 2366 cols = sbuf_aj_i + ct2; 2367 2368 lwrite = 0; 2369 for (PetscInt l = 0; l < nzB; l++) { 2370 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2371 } 2372 for (PetscInt l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2373 for (PetscInt l = 0; l < nzB; l++) { 2374 if ((ctmp = bmap[cworkB[l]]) >= cend) 
cols[lwrite++] = ctmp; 2375 } 2376 2377 ct2 += ncols; 2378 } 2379 } 2380 PetscCallMPI(MPIU_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2381 } 2382 } 2383 2384 /* create col map: global col of C -> local col of submatrices */ 2385 { 2386 const PetscInt *icol_i; 2387 #if defined(PETSC_USE_CTABLE) 2388 for (PetscInt i = 0; i < ismax; i++) { 2389 if (!allcolumns[i]) { 2390 PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i)); 2391 2392 jmax = ncol[i]; 2393 icol_i = icol[i]; 2394 cmap_i = cmap[i]; 2395 for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1)); 2396 } else cmap[i] = NULL; 2397 } 2398 #else 2399 for (PetscInt i = 0; i < ismax; i++) { 2400 if (!allcolumns[i]) { 2401 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2402 jmax = ncol[i]; 2403 icol_i = icol[i]; 2404 cmap_i = cmap[i]; 2405 for (PetscInt j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2406 } else cmap[i] = NULL; 2407 } 2408 #endif 2409 } 2410 2411 /* Create lens which is required for MatCreate... 
*/ 2412 jcnt = 0; 2413 for (PetscInt i = 0; i < ismax; i++) jcnt += nrow[i]; 2414 PetscCall(PetscMalloc1(ismax, &lens)); 2415 2416 if (ismax) PetscCall(PetscCalloc1(jcnt, &lens[0])); 2417 for (PetscInt i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]); 2418 2419 /* Update lens from local data */ 2420 for (PetscInt i = 0; i < ismax; i++) { 2421 row2proc_i = row2proc[i]; 2422 jmax = nrow[i]; 2423 if (!allcolumns[i]) cmap_i = cmap[i]; 2424 irow_i = irow[i]; 2425 lens_i = lens[i]; 2426 for (PetscInt j = 0; j < jmax; j++) { 2427 row = irow_i[j]; 2428 proc = row2proc_i[j]; 2429 if (proc == rank) { 2430 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2431 if (!allcolumns[i]) { 2432 for (PetscInt k = 0; k < ncols; k++) { 2433 #if defined(PETSC_USE_CTABLE) 2434 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2435 #else 2436 tcol = cmap_i[cols[k]]; 2437 #endif 2438 if (tcol) lens_i[j]++; 2439 } 2440 } else { /* allcolumns */ 2441 lens_i[j] = ncols; 2442 } 2443 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2444 } 2445 } 2446 } 2447 2448 /* Create row map: global row of C -> local row of submatrices */ 2449 #if defined(PETSC_USE_CTABLE) 2450 for (PetscInt i = 0; i < ismax; i++) { 2451 PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i)); 2452 irow_i = irow[i]; 2453 jmax = nrow[i]; 2454 for (PetscInt j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1)); 2455 } 2456 #else 2457 for (PetscInt i = 0; i < ismax; i++) { 2458 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2459 rmap_i = rmap[i]; 2460 irow_i = irow[i]; 2461 jmax = nrow[i]; 2462 for (PetscInt j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2463 } 2464 #endif 2465 2466 /* Update lens from offproc data */ 2467 { 2468 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2469 2470 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2471 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2472 sbuf1_i = sbuf1[pa[tmp2]]; 2473 jmax = 
sbuf1_i[0]; 2474 ct1 = 2 * jmax + 1; 2475 ct2 = 0; 2476 rbuf2_i = rbuf2[tmp2]; 2477 rbuf3_i = rbuf3[tmp2]; 2478 for (PetscInt j = 1; j <= jmax; j++) { 2479 is_no = sbuf1_i[2 * j - 1]; 2480 max1 = sbuf1_i[2 * j]; 2481 lens_i = lens[is_no]; 2482 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2483 rmap_i = rmap[is_no]; 2484 for (PetscInt k = 0; k < max1; k++, ct1++) { 2485 #if defined(PETSC_USE_CTABLE) 2486 PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row)); 2487 row--; 2488 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2489 #else 2490 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2491 #endif 2492 max2 = rbuf2_i[ct1]; 2493 for (PetscInt l = 0; l < max2; l++, ct2++) { 2494 if (!allcolumns[is_no]) { 2495 #if defined(PETSC_USE_CTABLE) 2496 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2497 #else 2498 tcol = cmap_i[rbuf3_i[ct2]]; 2499 #endif 2500 if (tcol) lens_i[row]++; 2501 } else { /* allcolumns */ 2502 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2503 } 2504 } 2505 } 2506 } 2507 } 2508 } 2509 PetscCall(PetscFree(r_waits3)); 2510 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2511 PetscCall(PetscFree(s_waits3)); 2512 2513 /* Create the submatrices */ 2514 for (PetscInt i = 0; i < ismax; i++) { 2515 PetscInt rbs, cbs; 2516 2517 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2518 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2519 2520 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2521 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2522 2523 if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2524 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2525 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2526 2527 /* create struct Mat_SubSppt and attached it to submat */ 2528 PetscCall(PetscNew(&smat_i)); 2529 subc = (Mat_SeqAIJ *)submats[i]->data; 2530 subc->submatis1 = smat_i; 2531 2532 smat_i->destroy = submats[i]->ops->destroy; 2533 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2534 submats[i]->factortype = C->factortype; 2535 2536 smat_i->id = i; 2537 smat_i->nrqs = nrqs; 2538 smat_i->nrqr = nrqr; 2539 smat_i->rbuf1 = rbuf1; 2540 smat_i->rbuf2 = rbuf2; 2541 smat_i->rbuf3 = rbuf3; 2542 smat_i->sbuf2 = sbuf2; 2543 smat_i->req_source2 = req_source2; 2544 2545 smat_i->sbuf1 = sbuf1; 2546 smat_i->ptr = ptr; 2547 smat_i->tmp = tmp; 2548 smat_i->ctr = ctr; 2549 2550 smat_i->pa = pa; 2551 smat_i->req_size = req_size; 2552 smat_i->req_source1 = req_source1; 2553 2554 smat_i->allcolumns = allcolumns[i]; 2555 smat_i->singleis = PETSC_FALSE; 2556 smat_i->row2proc = row2proc[i]; 2557 smat_i->rmap = rmap[i]; 2558 smat_i->cmap = cmap[i]; 2559 } 2560 2561 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2562 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2563 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2564 PetscCall(MatSetType(submats[0], MATDUMMY)); 2565 
2566 /* create struct Mat_SubSppt and attached it to submat */ 2567 PetscCall(PetscNew(&smat_i)); 2568 submats[0]->data = (void *)smat_i; 2569 2570 smat_i->destroy = submats[0]->ops->destroy; 2571 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2572 submats[0]->factortype = C->factortype; 2573 2574 smat_i->id = 0; 2575 smat_i->nrqs = nrqs; 2576 smat_i->nrqr = nrqr; 2577 smat_i->rbuf1 = rbuf1; 2578 smat_i->rbuf2 = rbuf2; 2579 smat_i->rbuf3 = rbuf3; 2580 smat_i->sbuf2 = sbuf2; 2581 smat_i->req_source2 = req_source2; 2582 2583 smat_i->sbuf1 = sbuf1; 2584 smat_i->ptr = ptr; 2585 smat_i->tmp = tmp; 2586 smat_i->ctr = ctr; 2587 2588 smat_i->pa = pa; 2589 smat_i->req_size = req_size; 2590 smat_i->req_source1 = req_source1; 2591 2592 smat_i->allcolumns = PETSC_FALSE; 2593 smat_i->singleis = PETSC_FALSE; 2594 smat_i->row2proc = NULL; 2595 smat_i->rmap = NULL; 2596 smat_i->cmap = NULL; 2597 } 2598 2599 if (ismax) PetscCall(PetscFree(lens[0])); 2600 PetscCall(PetscFree(lens)); 2601 if (sbuf_aj) { 2602 PetscCall(PetscFree(sbuf_aj[0])); 2603 PetscCall(PetscFree(sbuf_aj)); 2604 } 2605 2606 } /* endof scall == MAT_INITIAL_MATRIX */ 2607 2608 /* Post recv matrix values */ 2609 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2610 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2611 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2612 for (PetscMPIInt i = 0; i < nrqs; ++i) { 2613 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2614 PetscCallMPI(MPIU_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2615 } 2616 2617 /* Allocate sending buffers for a->a, and send them off */ 2618 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2619 jcnt = 0; 2620 for (PetscMPIInt i = 0; i < nrqr; i++) jcnt += req_size[i]; 2621 if (nrqr) PetscCall(PetscMalloc1(jcnt, &sbuf_aa[0])); 2622 for (PetscMPIInt i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2623 2624 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2625 { 2626 PetscInt nzA, nzB, *a_i = a->i, *b_i = 
b->i, *cworkB, lwrite; 2627 PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2628 PetscInt cend = C->cmap->rend; 2629 PetscInt *b_j = b->j; 2630 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2631 2632 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2633 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2634 for (PetscMPIInt i = 0; i < nrqr; i++) { 2635 rbuf1_i = rbuf1[i]; 2636 sbuf_aa_i = sbuf_aa[i]; 2637 ct1 = 2 * rbuf1_i[0] + 1; 2638 ct2 = 0; 2639 for (PetscInt j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2640 kmax = rbuf1_i[2 * j]; 2641 for (PetscInt k = 0; k < kmax; k++, ct1++) { 2642 row = rbuf1_i[ct1] - rstart; 2643 nzA = a_i[row + 1] - a_i[row]; 2644 nzB = b_i[row + 1] - b_i[row]; 2645 ncols = nzA + nzB; 2646 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2647 vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]); 2648 vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]); 2649 2650 /* load the column values for this row into vals*/ 2651 vals = sbuf_aa_i + ct2; 2652 2653 lwrite = 0; 2654 for (PetscInt l = 0; l < nzB; l++) { 2655 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2656 } 2657 for (PetscInt l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2658 for (PetscInt l = 0; l < nzB; l++) { 2659 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2660 } 2661 2662 ct2 += ncols; 2663 } 2664 } 2665 PetscCallMPI(MPIU_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2666 } 2667 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2668 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2669 } 2670 2671 /* Assemble the matrices */ 2672 /* First assemble the local rows */ 2673 for (PetscInt i = 0; i < ismax; i++) { 2674 row2proc_i = row2proc[i]; 2675 subc = (Mat_SeqAIJ *)submats[i]->data; 2676 imat_ilen = subc->ilen; 2677 imat_j = subc->j; 2678 imat_i = subc->i; 2679 imat_a = subc->a; 2680 2681 if (!allcolumns[i]) 
cmap_i = cmap[i]; 2682 rmap_i = rmap[i]; 2683 irow_i = irow[i]; 2684 jmax = nrow[i]; 2685 for (PetscInt j = 0; j < jmax; j++) { 2686 row = irow_i[j]; 2687 proc = row2proc_i[j]; 2688 if (proc == rank) { 2689 old_row = row; 2690 #if defined(PETSC_USE_CTABLE) 2691 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2692 row--; 2693 #else 2694 row = rmap_i[row]; 2695 #endif 2696 ilen_row = imat_ilen[row]; 2697 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2698 mat_i = imat_i[row]; 2699 mat_a = imat_a + mat_i; 2700 mat_j = imat_j + mat_i; 2701 if (!allcolumns[i]) { 2702 for (PetscInt k = 0; k < ncols; k++) { 2703 #if defined(PETSC_USE_CTABLE) 2704 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2705 #else 2706 tcol = cmap_i[cols[k]]; 2707 #endif 2708 if (tcol) { 2709 *mat_j++ = tcol - 1; 2710 *mat_a++ = vals[k]; 2711 ilen_row++; 2712 } 2713 } 2714 } else { /* allcolumns */ 2715 for (PetscInt k = 0; k < ncols; k++) { 2716 *mat_j++ = cols[k]; /* global col index! 
*/ 2717 *mat_a++ = vals[k]; 2718 ilen_row++; 2719 } 2720 } 2721 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2722 2723 imat_ilen[row] = ilen_row; 2724 } 2725 } 2726 } 2727 2728 /* Now assemble the off proc rows */ 2729 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2730 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2731 sbuf1_i = sbuf1[pa[tmp2]]; 2732 jmax = sbuf1_i[0]; 2733 ct1 = 2 * jmax + 1; 2734 ct2 = 0; 2735 rbuf2_i = rbuf2[tmp2]; 2736 rbuf3_i = rbuf3[tmp2]; 2737 rbuf4_i = rbuf4[tmp2]; 2738 for (PetscInt j = 1; j <= jmax; j++) { 2739 is_no = sbuf1_i[2 * j - 1]; 2740 rmap_i = rmap[is_no]; 2741 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2742 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2743 imat_ilen = subc->ilen; 2744 imat_j = subc->j; 2745 imat_i = subc->i; 2746 imat_a = subc->a; 2747 max1 = sbuf1_i[2 * j]; 2748 for (PetscInt k = 0; k < max1; k++, ct1++) { 2749 row = sbuf1_i[ct1]; 2750 #if defined(PETSC_USE_CTABLE) 2751 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2752 row--; 2753 #else 2754 row = rmap_i[row]; 2755 #endif 2756 ilen = imat_ilen[row]; 2757 mat_i = imat_i[row]; 2758 mat_a = PetscSafePointerPlusOffset(imat_a, mat_i); 2759 mat_j = PetscSafePointerPlusOffset(imat_j, mat_i); 2760 max2 = rbuf2_i[ct1]; 2761 if (!allcolumns[is_no]) { 2762 for (PetscInt l = 0; l < max2; l++, ct2++) { 2763 #if defined(PETSC_USE_CTABLE) 2764 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2765 #else 2766 tcol = cmap_i[rbuf3_i[ct2]]; 2767 #endif 2768 if (tcol) { 2769 *mat_j++ = tcol - 1; 2770 *mat_a++ = rbuf4_i[ct2]; 2771 ilen++; 2772 } 2773 } 2774 } else { /* allcolumns */ 2775 for (PetscInt l = 0; l < max2; l++, ct2++) { 2776 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2777 *mat_a++ = rbuf4_i[ct2]; 2778 ilen++; 2779 } 2780 } 2781 imat_ilen[row] = ilen; 2782 } 2783 } 2784 } 2785 2786 if (!iscsorted) { /* sort column indices of the rows */ 2787 for (PetscInt i = 0; i < ismax; i++) { 
2788 subc = (Mat_SeqAIJ *)submats[i]->data; 2789 imat_j = subc->j; 2790 imat_i = subc->i; 2791 imat_a = subc->a; 2792 imat_ilen = subc->ilen; 2793 2794 if (allcolumns[i]) continue; 2795 jmax = nrow[i]; 2796 for (PetscInt j = 0; j < jmax; j++) { 2797 mat_i = imat_i[j]; 2798 mat_a = imat_a + mat_i; 2799 mat_j = imat_j + mat_i; 2800 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2801 } 2802 } 2803 } 2804 2805 PetscCall(PetscFree(r_waits4)); 2806 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2807 PetscCall(PetscFree(s_waits4)); 2808 2809 /* Restore the indices */ 2810 for (PetscInt i = 0; i < ismax; i++) { 2811 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2812 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2813 } 2814 2815 for (PetscInt i = 0; i < ismax; i++) { 2816 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2817 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2818 } 2819 2820 /* Destroy allocated memory */ 2821 if (sbuf_aa) { 2822 PetscCall(PetscFree(sbuf_aa[0])); 2823 PetscCall(PetscFree(sbuf_aa)); 2824 } 2825 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2826 2827 for (PetscMPIInt i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2828 PetscCall(PetscFree(rbuf4)); 2829 2830 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2831 PetscFunctionReturn(PETSC_SUCCESS); 2832 } 2833 2834 /* 2835 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2836 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2837 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2838 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2839 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2840 state, and needs to be "assembled" later by compressing B's column space. 
2841 2842 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2843 Following this call, C->A & C->B have been created, even if empty. 2844 */ 2845 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2846 { 2847 /* If making this function public, change the error returned in this function away from _PLIB. */ 2848 Mat_MPIAIJ *aij; 2849 Mat_SeqAIJ *Baij; 2850 PetscBool seqaij, Bdisassembled; 2851 PetscInt m, n, *nz, ngcol, col, cstart, cend, shift, count; 2852 PetscScalar v; 2853 const PetscInt *rowindices, *colindices; 2854 2855 PetscFunctionBegin; 2856 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2857 if (A) { 2858 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2859 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2860 if (rowemb) { 2861 PetscCall(ISGetLocalSize(rowemb, &m)); 2862 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2863 } else { 2864 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2865 } 2866 if (dcolemb) { 2867 PetscCall(ISGetLocalSize(dcolemb, &n)); 2868 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2869 } else { 2870 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2871 } 2872 } 2873 if (B) { 2874 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2875 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2876 if (rowemb) { 2877 
PetscCall(ISGetLocalSize(rowemb, &m)); 2878 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2879 } else { 2880 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2881 } 2882 if (ocolemb) { 2883 PetscCall(ISGetLocalSize(ocolemb, &n)); 2884 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2885 } else { 2886 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2887 } 2888 } 2889 2890 aij = (Mat_MPIAIJ *)C->data; 2891 if (!aij->A) { 2892 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2893 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2894 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2895 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2896 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2897 } 2898 if (A) { 2899 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2900 } else { 2901 PetscCall(MatSetUp(aij->A)); 2902 } 2903 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2904 /* 2905 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2906 need to "disassemble" B -- convert it to using C's global indices. 2907 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2908 2909 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2910 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2911 2912 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 
2913 At least avoid calling MatSetValues() and the implied searches? 2914 */ 2915 2916 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2917 #if defined(PETSC_USE_CTABLE) 2918 PetscCall(PetscHMapIDestroy(&aij->colmap)); 2919 #else 2920 PetscCall(PetscFree(aij->colmap)); 2921 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2922 #endif 2923 ngcol = 0; 2924 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2925 if (aij->garray) PetscCall(PetscFree(aij->garray)); 2926 PetscCall(VecDestroy(&aij->lvec)); 2927 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2928 } 2929 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2930 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2931 } 2932 Bdisassembled = PETSC_FALSE; 2933 if (!aij->B) { 2934 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2935 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2936 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2937 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2938 Bdisassembled = PETSC_TRUE; 2939 } 2940 if (B) { 2941 Baij = (Mat_SeqAIJ *)B->data; 2942 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2943 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2944 for (PetscInt i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2945 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2946 PetscCall(PetscFree(nz)); 2947 } 2948 2949 PetscCall(PetscLayoutGetRange(C->cmap, &cstart, &cend)); 2950 shift = cend - cstart; 2951 count = 0; 2952 rowindices = NULL; 2953 colindices = NULL; 2954 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2955 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2956 for (PetscInt i = 0; i < B->rmap->n; i++) { 2957 PetscInt row; 2958 row = i; 2959 if (rowindices) row = rowindices[i]; 2960 for (PetscInt j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2961 col = Baij->j[count]; 2962 if (colindices) col = 
colindices[col]; 2963 if (Bdisassembled && col >= cstart) col += shift; 2964 v = Baij->a[count]; 2965 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2966 ++count; 2967 } 2968 } 2969 /* No assembly for aij->B is necessary. */ 2970 /* FIXME: set aij->B's nonzerostate correctly. */ 2971 } else { 2972 PetscCall(MatSetUp(aij->B)); 2973 } 2974 C->preallocated = PETSC_TRUE; 2975 C->was_assembled = PETSC_FALSE; 2976 C->assembled = PETSC_FALSE; 2977 /* 2978 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2979 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2980 */ 2981 PetscFunctionReturn(PETSC_SUCCESS); 2982 } 2983 2984 /* 2985 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2986 */ 2987 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 2988 { 2989 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 2990 2991 PetscFunctionBegin; 2992 PetscAssertPointer(A, 2); 2993 PetscAssertPointer(B, 3); 2994 /* FIXME: make sure C is assembled */ 2995 *A = aij->A; 2996 *B = aij->B; 2997 /* Note that we don't incref *A and *B, so be careful! */ 2998 PetscFunctionReturn(PETSC_SUCCESS); 2999 } 3000 3001 /* 3002 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3003 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  /* NOTE(review): getlocalmats is accepted for interface symmetry with the callbacks below but is
     never called in this body -- confirm before removing it from the signature. */
  PetscMPIInt size, flag;
  PetscInt    cismax, ispar; /* cismax = number of parallel (comm size > 1) ISs on this rank */
  Mat        *A, *B;         /* sequential diag (A) and off-diag (B) pieces returned by getsubmats_seq */
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p; /* *_p are sort permutations; cis* are the parallel-IS subsets */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

  /* Row and column ISs must be paired on the same communicator; count the parallel pairs. */
  cismax = 0;
  for (PetscInt i = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  PetscCallMPI(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) { /* ii indexes only the parallel ISs */
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    /* After sorting, any duplicate must be adjacent */
    for (PetscInt j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (PetscInt j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* Alias (no incref) the parallel row IS for the off-diag extraction */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (PetscInt i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        PetscCall(MatDestroy(&AA)); /* A[i]'s values now live in (*submat)[i]; drop this reference */
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        /* Indices were permuted during sorting: duplicate the shape and re-insert permuted values */
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller's submat array */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A)); /* only the array: surviving A[i] were handed off or destroyed above */
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MPIAIJ instantiation of the generic extractor above: extract parallel submatrices of C
   indexed by (isrow[i], iscol[i]) pairs whose ISs may live on subcommunicators of C. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}