1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 /* 22 Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks 23 24 The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but 25 that is a very major recoding job. 

   Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
*/
static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
{
  PetscFunctionBegin;
  for (PetscInt i = 0; i < imax; i++) {
    if (!is[i]) break;
    PetscInt        n = 0, N, Nmax, Nmin;
    const PetscInt *idx;
    PetscInt       *nidx = NULL;
    MPI_Comm        comm;
    PetscBT         bt;

    PetscCall(ISGetLocalSize(is[i], &N));
    if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
      PetscCall(ISGetIndices(is[i], &idx));
      PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
      /* work in block indices; the bit table only needs to span the touched block range */
      Nmin = Nmin / bs;
      Nmax = Nmax / bs;
      PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
      /* first pass: count the distinct blocks referenced by idx[] */
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
      }
      PetscCall(PetscMalloc1(n, &nidx));
      n = 0;
      PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
      /* second pass: record each distinct block index exactly once */
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
      }
      PetscCall(PetscBTDestroy(&bt));
      PetscCall(ISRestoreIndices(is[i], &idx));
    }
    PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
    /* replace is[i] with a block IS that covers the same entries (nidx ownership transfers) */
    PetscCall(ISDestroy(is + i));
    PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Expands each of the imax index sets by ov rings of matrix connectivity,
   then snaps the result to the matrix block size when rows and columns share a block size.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Memory-scalable variant of MatIncreaseOverlap_MPIAIJ(); same contract, but uses
   the *_Scalable communication path (PetscSF based) for each overlap ring.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   One ring of overlap growth, memory-scalable path: rows not owned locally are shipped
   to their owner ranks via a PetscSF, the owners expand them, and the expanded index
   data is shipped back and merged into is[].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second exchange: reply sizes, then a second SF to pull the expanded rows back */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Merges the overlap data received from remote owners (recvdata, a stream of
   <is id, count, rows...> triples) into the local index sets is[], removing duplicates.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ?
                lsize : max_lsize; /* running maximum of local IS sizes */
    isz[i] = lsize;
  }
  /* one contiguous slab per IS: max_lsize + nrecvs is a safe upper bound on its final size */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(indices_temp, i * (max_lsize + nrecvs)), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = PetscSafePointerPlusOffset(indices_temp, (max_lsize + nrecvs) * i);
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Expands the rows requested by remote ranks (fromrows, pairs <is id, row>) using the
   local diagonal (Amat) and off-diagonal (Bmat) blocks, and packs the reply as
   <is id, count, rows...> triples.  Outputs *sbrowsizes (per-rank size/offset pairs)
   and *sbrows (packed reply data); caller takes ownership of both.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = PetscSafePointerPlusOffset(rows_data_ptr, max_fszs * i);
  rows_pos  = 0;
  totalrows = 0;
  /* first sweep: only count the reply sizes (indv_counts) so sbdata can be sized exactly */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* second sweep: same expansion, this time packing the actual reply payload */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Performs the purely local part of one overlap ring: every locally-owned row already
   in an IS is expanded with its Amat/Bmat column indices; remotely-owned rows are skipped
   (they are handled by the send/receive pair above).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF,
             PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue; /* remotely-owned rows are expanded by their owner rank */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* rebuild is[i] on a duplicated communicator with the sorted, deduplicated expansion */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0;
  PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j]; /* total rows destined for proc j */
        w3[j]++;        /* number of ISs that touch proc j */
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  /* carve the single buffer tmp into one per-destination slice */
  {
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1; /* data region starts right after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0];
          outdat_j[2 * k]     = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i = 0; i < nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1; /* data starts right after the <is_no, count> header pairs */
      jmax    = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];
        is_no   = rbuf2_i[2 * j - 1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  /* rebuild each IS from the accumulated table (hash map stores <row+1, position+1>) */
  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscHashIter tpos;

    table_data_i = table_data[i];
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, k);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --k; /* undo the +1 offsets used to keep keys/values nonzero */
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
   the work on the local processor.

   Inputs:
    C      - MAT_MPIAIJ;
    imax   - total no of index sets processed at a time;
    table  - an array of char - size = m bits.

   Output:
    isz    - array containing the count of the solution elements corresponding
             to each index set;
    data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI    table_data_i;
  PetscHashIter tpos;
  PetscInt      tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, row);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --row; /* map stores <row+1, position+1>; undo both offsets */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    /* max is frozen before the loop: only rows already present are expanded;
       rows appended below belong to the next ring */
    max     = isz[i];

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
     MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
     and return the output

     Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

     Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

     For better efficiency perhaps we should malloc separately each xdata[i],
     then if a remalloc is required we need only copy the data for that one row
     rather than all previous rows as it is now where a single large chunk of
     memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz = total number of row indices received; ct = total number of index sets */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  /* heuristic initial size for the reply buffer; grown on demand below */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    /* xdata[0] is one contiguous chunk; xdata[i] for i > 0 point into it */
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0;
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2 * rbuf_0 + 1;      /* read position in rbuf_i: data starts after the header */
    ct2    = ct1;                 /* write position in xdata[i] */
    ct3 += ct1;                   /* running total written, compared against mem_estimate */
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m, xtable));
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared chunk by 1.5x and rebase every xdata[] pointer into it */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each newly recorded row by the global columns of its nonzeros (A then B part) */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) { /* Amat */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) { /* Bmat */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate =
(PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              /* rebase all reply pointers into the reallocated chunk */
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; /* next reply starts right after this one in the shared chunk */
    isz1[i] = ct2;                                   /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
/*
   Every processor gets the entire matrix: column indices (and, for
   MAT_GET_VALUES, numerical values) are combined across ranks by in-place
   MPI_SUM reductions over buffers that are zero everywhere except each
   rank's own rows.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt size, rank;
  PetscInt    sendcount, i, *rstarts = A->rmap->range, n, cnt, j, nrecv = 0;
  PetscInt    m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* Tell every processor the number of nonzeros per row; each rank fills only
       its own rows (A and B block contributions), the reduction sums the rest */
    PetscCall(PetscCalloc1(A->rmap->N, &lens));
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];

    /* All MPI processes get the same matrix */
    PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, lens, A->rmap->N, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Create the sequential matrix of the same type as the local block diagonal */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    PetscCall(PetscCalloc1(2, Bin)); /* caller receives a length-2 array; only the first entry is set here */
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /* zero column space */
    nrecv = 0;
    for (i = 0; i < size; i++) {
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) nrecv += lens[j];
    }
    PetscCall(PetscArrayzero(b->j, nrecv));

    /* Copy my part of matrix column indices over, in global (ascending) column order:
       B entries left of the diagonal block, then A entries, then B entries right of it */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = PetscSafePointerPlusOffset(b->j, b->i[rstarts[rank]]);
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* send column indices, b->j was zeroed */
    PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->j, nrecv, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)A)));

    /* Assemble the matrix into usable form (numerical values not yet set) */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /* Copy my part of matrix numerical values into the values location */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf;

    /* initialize b->a */
    PetscCall(PetscArrayzero(b->a, b->nz));

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = PetscSafePointerPlusOffset(b->a, b->i[rstarts[rank]]);
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are interleaved in the same lower/diagonal/upper order used above for the indices */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT
" actual nz %" PetscInt_FMT, sendcount, cnt); 1204 1205 /* send values, b->a was zeroed */ 1206 PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, b->a, b->nz, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)A))); 1207 1208 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada)); 1209 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda)); 1210 } /* endof (flag == MAT_GET_VALUES) */ 1211 1212 PetscCall(MatPropagateSymmetryOptions(A, B)); 1213 PetscFunctionReturn(PETSC_SUCCESS); 1214 } 1215 1216 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats) 1217 { 1218 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 1219 Mat submat, A = c->A, B = c->B; 1220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc; 1221 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB; 1222 PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray; 1223 const PetscInt *icol, *irow; 1224 PetscInt nrow, ncol, start; 1225 PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr; 1226 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc; 1227 PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr; 1228 PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz; 1229 PetscInt *lens, rmax, ncols, *cols, Crow; 1230 #if defined(PETSC_USE_CTABLE) 1231 PetscHMapI cmap, rmap; 1232 PetscInt *cmap_loc, *rmap_loc; 1233 #else 1234 PetscInt *cmap, *rmap; 1235 #endif 1236 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i; 1237 PetscInt *cworkB, lwrite, *subcols, *row2proc; 1238 PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL; 1239 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 1240 MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4; 1241 MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1242 MPI_Status 
*r_status3 = NULL, *r_status4, *s_status4; 1243 MPI_Comm comm; 1244 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1245 PetscMPIInt *onodes1, *olengths1, idex, end; 1246 Mat_SubSppt *smatis1; 1247 PetscBool isrowsorted, iscolsorted; 1248 1249 PetscFunctionBegin; 1250 PetscValidLogicalCollectiveInt(C, ismax, 2); 1251 PetscValidLogicalCollectiveEnum(C, scall, 5); 1252 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1253 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1254 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1255 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1256 size = c->size; 1257 rank = c->rank; 1258 1259 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1260 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1261 PetscCall(ISGetIndices(isrow[0], &irow)); 1262 PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1263 if (allcolumns) { 1264 icol = NULL; 1265 ncol = C->cmap->N; 1266 } else { 1267 PetscCall(ISGetIndices(iscol[0], &icol)); 1268 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1269 } 1270 1271 if (scall == MAT_INITIAL_MATRIX) { 1272 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1273 1274 /* Get some new tags to keep the communication clean */ 1275 tag1 = ((PetscObject)C)->tag; 1276 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1277 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1278 1279 /* evaluate communication - mesg to who, length of mesg, and buffer space 1280 required. 
Based on this, buffers are allocated, and data copied into them */ 1281 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1282 PetscCall(PetscMalloc1(nrow, &row2proc)); 1283 1284 /* w1[proc] = num of rows owned by proc -- to be requested */ 1285 proc = 0; 1286 nrqs = 0; /* num of outgoing messages */ 1287 for (j = 0; j < nrow; j++) { 1288 row = irow[j]; 1289 if (!isrowsorted) proc = 0; 1290 while (row >= C->rmap->range[proc + 1]) proc++; 1291 w1[proc]++; 1292 row2proc[j] = proc; /* map row index to proc */ 1293 1294 if (proc != rank && !w2[proc]) { 1295 w2[proc] = 1; 1296 nrqs++; 1297 } 1298 } 1299 w1[rank] = 0; /* rows owned by self will not be requested */ 1300 1301 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1302 for (proc = 0, j = 0; proc < size; proc++) { 1303 if (w1[proc]) pa[j++] = proc; 1304 } 1305 1306 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1307 msz = 0; /* total mesg length (for all procs) */ 1308 for (i = 0; i < nrqs; i++) { 1309 proc = pa[i]; 1310 w1[proc] += 3; 1311 msz += w1[proc]; 1312 } 1313 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1314 1315 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1316 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1317 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1318 1319 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1320 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1321 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1322 1323 /* Now post the Irecvs corresponding to these messages */ 1324 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1325 1326 PetscCall(PetscFree(onodes1)); 1327 PetscCall(PetscFree(olengths1)); 1328 1329 /* Allocate Memory for outgoing messages */ 1330 
PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1331 PetscCall(PetscArrayzero(sbuf1, size)); 1332 PetscCall(PetscArrayzero(ptr, size)); 1333 1334 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1335 iptr = tmp; 1336 for (i = 0; i < nrqs; i++) { 1337 proc = pa[i]; 1338 sbuf1[proc] = iptr; 1339 iptr += w1[proc]; 1340 } 1341 1342 /* Form the outgoing messages */ 1343 /* Initialize the header space */ 1344 for (i = 0; i < nrqs; i++) { 1345 proc = pa[i]; 1346 PetscCall(PetscArrayzero(sbuf1[proc], 3)); 1347 ptr[proc] = sbuf1[proc] + 3; 1348 } 1349 1350 /* Parse the isrow and copy data into outbuf */ 1351 PetscCall(PetscArrayzero(ctr, size)); 1352 for (j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1353 proc = row2proc[j]; 1354 if (proc != rank) { /* copy to the outgoing buf*/ 1355 *ptr[proc] = irow[j]; 1356 ctr[proc]++; 1357 ptr[proc]++; 1358 } 1359 } 1360 1361 /* Update the headers for the current IS */ 1362 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 1363 if ((ctr_j = ctr[j])) { 1364 sbuf1_j = sbuf1[j]; 1365 k = ++sbuf1_j[0]; 1366 sbuf1_j[2 * k] = ctr_j; 1367 sbuf1_j[2 * k - 1] = 0; 1368 } 1369 } 1370 1371 /* Now post the sends */ 1372 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1373 for (i = 0; i < nrqs; ++i) { 1374 proc = pa[i]; 1375 PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i)); 1376 } 1377 1378 /* Post Receives to capture the buffer size */ 1379 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1380 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1381 1382 if (nrqs) rbuf2[0] = tmp + msz; 1383 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1384 1385 for (i = 0; i < nrqs; ++i) { 1386 proc = pa[i]; 1387 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i)); 1388 } 1389 1390 PetscCall(PetscFree2(w1, w2)); 1391 1392 /* Send to 
other procs the buf size they should allocate */ 1393 /* Receive messages*/ 1394 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1395 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1396 1397 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1398 for (i = 0; i < nrqr; ++i) { 1399 req_size[i] = 0; 1400 rbuf1_i = rbuf1[i]; 1401 start = 2 * rbuf1_i[0] + 1; 1402 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1403 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1404 sbuf2_i = sbuf2[i]; 1405 for (j = start; j < end; j++) { 1406 k = rbuf1_i[j] - rstart; 1407 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1408 sbuf2_i[j] = ncols; 1409 req_size[i] += ncols; 1410 } 1411 req_source1[i] = r_status1[i].MPI_SOURCE; 1412 1413 /* form the header */ 1414 sbuf2_i[0] = req_size[i]; 1415 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1416 1417 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1418 } 1419 1420 PetscCall(PetscFree(r_status1)); 1421 PetscCall(PetscFree(r_waits1)); 1422 1423 /* rbuf2 is received, Post recv column indices a->j */ 1424 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1425 1426 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1427 for (i = 0; i < nrqs; ++i) { 1428 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 1429 req_source2[i] = r_status2[i].MPI_SOURCE; 1430 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1431 } 1432 1433 /* Wait on sends1 and sends2 */ 1434 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1435 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1436 PetscCall(PetscFree(s_waits1)); 1437 PetscCall(PetscFree(s_status1)); 1438 1439 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1440 PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2)); 1441 1442 /* Now allocate sending buffers for a->j, and send them off */ 1443 PetscCall(PetscMalloc1(nrqr, 
&sbuf_aj)); 1444 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1445 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 1446 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1447 1448 for (i = 0; i < nrqr; i++) { /* for each requested message */ 1449 rbuf1_i = rbuf1[i]; 1450 sbuf_aj_i = sbuf_aj[i]; 1451 ct1 = 2 * rbuf1_i[0] + 1; 1452 ct2 = 0; 1453 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1454 1455 kmax = rbuf1[i][2]; 1456 for (k = 0; k < kmax; k++, ct1++) { /* for each row */ 1457 row = rbuf1_i[ct1] - rstart; 1458 nzA = ai[row + 1] - ai[row]; 1459 nzB = bi[row + 1] - bi[row]; 1460 ncols = nzA + nzB; 1461 cworkA = PetscSafePointerPlusOffset(aj, ai[row]); 1462 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1463 1464 /* load the column indices for this row into cols*/ 1465 cols = PetscSafePointerPlusOffset(sbuf_aj_i, ct2); 1466 1467 lwrite = 0; 1468 for (l = 0; l < nzB; l++) { 1469 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1470 } 1471 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1472 for (l = 0; l < nzB; l++) { 1473 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1474 } 1475 1476 ct2 += ncols; 1477 } 1478 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1479 } 1480 1481 /* create column map (cmap): global col of C -> local col of submat */ 1482 #if defined(PETSC_USE_CTABLE) 1483 if (!allcolumns) { 1484 PetscCall(PetscHMapICreateWithSize(ncol, &cmap)); 1485 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1486 for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1487 if (icol[j] >= cstart && icol[j] < cend) { 1488 cmap_loc[icol[j] - cstart] = j + 1; 1489 } else { /* use PetscHMapI for non-local col indices */ 1490 PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1)); 1491 } 1492 } 1493 } else { 1494 cmap = NULL; 1495 cmap_loc = NULL; 1496 } 1497 
PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1498 #else 1499 if (!allcolumns) { 1500 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1501 for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1502 } else { 1503 cmap = NULL; 1504 } 1505 #endif 1506 1507 /* Create lens for MatSeqAIJSetPreallocation() */ 1508 PetscCall(PetscCalloc1(nrow, &lens)); 1509 1510 /* Compute lens from local part of C */ 1511 for (j = 0; j < nrow; j++) { 1512 row = irow[j]; 1513 proc = row2proc[j]; 1514 if (proc == rank) { 1515 /* diagonal part A = c->A */ 1516 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1517 cols = PetscSafePointerPlusOffset(aj, ai[row - rstart]); 1518 if (!allcolumns) { 1519 for (k = 0; k < ncols; k++) { 1520 #if defined(PETSC_USE_CTABLE) 1521 tcol = cmap_loc[cols[k]]; 1522 #else 1523 tcol = cmap[cols[k] + cstart]; 1524 #endif 1525 if (tcol) lens[j]++; 1526 } 1527 } else { /* allcolumns */ 1528 lens[j] = ncols; 1529 } 1530 1531 /* off-diagonal part B = c->B */ 1532 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1533 cols = PetscSafePointerPlusOffset(bj, bi[row - rstart]); 1534 if (!allcolumns) { 1535 for (k = 0; k < ncols; k++) { 1536 #if defined(PETSC_USE_CTABLE) 1537 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1538 #else 1539 tcol = cmap[bmap[cols[k]]]; 1540 #endif 1541 if (tcol) lens[j]++; 1542 } 1543 } else { /* allcolumns */ 1544 lens[j] += ncols; 1545 } 1546 } 1547 } 1548 1549 /* Create row map (rmap): global row of C -> local row of submat */ 1550 #if defined(PETSC_USE_CTABLE) 1551 PetscCall(PetscHMapICreateWithSize(nrow, &rmap)); 1552 for (j = 0; j < nrow; j++) { 1553 row = irow[j]; 1554 proc = row2proc[j]; 1555 if (proc == rank) { /* a local row */ 1556 rmap_loc[row - rstart] = j; 1557 } else { 1558 PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1)); 1559 } 1560 } 1561 #else 1562 PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1563 for (j = 0; j < nrow; j++) rmap[irow[j]] = j; 1564 #endif 1565 1566 /* Update lens from offproc data */ 1567 
/* recv a->j is done */ 1568 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1569 for (i = 0; i < nrqs; i++) { 1570 proc = pa[i]; 1571 sbuf1_i = sbuf1[proc]; 1572 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1573 ct1 = 2 + 1; 1574 ct2 = 0; 1575 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1576 rbuf3_i = rbuf3[i]; /* received C->j */ 1577 1578 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1579 max1 = sbuf1_i[2]; 1580 for (k = 0; k < max1; k++, ct1++) { 1581 #if defined(PETSC_USE_CTABLE) 1582 PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row)); 1583 row--; 1584 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1585 #else 1586 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1587 #endif 1588 /* Now, store row index of submat in sbuf1_i[ct1] */ 1589 sbuf1_i[ct1] = row; 1590 1591 nnz = rbuf2_i[ct1]; 1592 if (!allcolumns) { 1593 for (l = 0; l < nnz; l++, ct2++) { 1594 #if defined(PETSC_USE_CTABLE) 1595 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1596 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1597 } else { 1598 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1599 } 1600 #else 1601 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1602 #endif 1603 if (tcol) lens[row]++; 1604 } 1605 } else { /* allcolumns */ 1606 lens[row] += nnz; 1607 } 1608 } 1609 } 1610 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1611 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1612 1613 /* Create the submatrices */ 1614 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1615 PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1616 1617 PetscCall(ISGetBlockSize(isrow[0], &i)); 1618 PetscCall(ISGetBlockSize(iscol[0], &j)); 1619 if (i > 1 || j > 1) PetscCall(MatSetBlockSizes(submat, i, j)); 1620 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1621 
PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1622 1623 /* create struct Mat_SubSppt and attached it to submat */ 1624 PetscCall(PetscNew(&smatis1)); 1625 subc = (Mat_SeqAIJ *)submat->data; 1626 subc->submatis1 = smatis1; 1627 1628 smatis1->id = 0; 1629 smatis1->nrqs = nrqs; 1630 smatis1->nrqr = nrqr; 1631 smatis1->rbuf1 = rbuf1; 1632 smatis1->rbuf2 = rbuf2; 1633 smatis1->rbuf3 = rbuf3; 1634 smatis1->sbuf2 = sbuf2; 1635 smatis1->req_source2 = req_source2; 1636 1637 smatis1->sbuf1 = sbuf1; 1638 smatis1->ptr = ptr; 1639 smatis1->tmp = tmp; 1640 smatis1->ctr = ctr; 1641 1642 smatis1->pa = pa; 1643 smatis1->req_size = req_size; 1644 smatis1->req_source1 = req_source1; 1645 1646 smatis1->allcolumns = allcolumns; 1647 smatis1->singleis = PETSC_TRUE; 1648 smatis1->row2proc = row2proc; 1649 smatis1->rmap = rmap; 1650 smatis1->cmap = cmap; 1651 #if defined(PETSC_USE_CTABLE) 1652 smatis1->rmap_loc = rmap_loc; 1653 smatis1->cmap_loc = cmap_loc; 1654 #endif 1655 1656 smatis1->destroy = submat->ops->destroy; 1657 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1658 submat->factortype = C->factortype; 1659 1660 /* compute rmax */ 1661 rmax = 0; 1662 for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1663 1664 } else { /* scall == MAT_REUSE_MATRIX */ 1665 submat = submats[0]; 1666 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1667 1668 subc = (Mat_SeqAIJ *)submat->data; 1669 rmax = subc->rmax; 1670 smatis1 = subc->submatis1; 1671 nrqs = smatis1->nrqs; 1672 nrqr = smatis1->nrqr; 1673 rbuf1 = smatis1->rbuf1; 1674 rbuf2 = smatis1->rbuf2; 1675 rbuf3 = smatis1->rbuf3; 1676 req_source2 = smatis1->req_source2; 1677 1678 sbuf1 = smatis1->sbuf1; 1679 sbuf2 = smatis1->sbuf2; 1680 ptr = smatis1->ptr; 1681 tmp = smatis1->tmp; 1682 ctr = smatis1->ctr; 1683 1684 pa = smatis1->pa; 1685 req_size = smatis1->req_size; 1686 req_source1 = smatis1->req_source1; 1687 1688 allcolumns = smatis1->allcolumns; 1689 row2proc = smatis1->row2proc; 1690 rmap = smatis1->rmap; 1691 cmap = smatis1->cmap; 1692 #if defined(PETSC_USE_CTABLE) 1693 rmap_loc = smatis1->rmap_loc; 1694 cmap_loc = smatis1->cmap_loc; 1695 #endif 1696 } 1697 1698 /* Post recv matrix values */ 1699 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1700 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1701 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1702 for (i = 0; i < nrqs; ++i) { 1703 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1704 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1705 } 1706 1707 /* Allocate sending buffers for a->a, and send them off */ 1708 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1709 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1710 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 1711 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1712 1713 for (i = 0; i < nrqr; i++) { 1714 rbuf1_i = rbuf1[i]; 1715 sbuf_aa_i = sbuf_aa[i]; 1716 ct1 = 2 * rbuf1_i[0] + 1; 1717 ct2 = 0; 1718 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1719 1720 kmax = rbuf1_i[2]; 1721 for (k = 0; k < kmax; k++, ct1++) { 1722 row = rbuf1_i[ct1] - rstart; 1723 nzA = ai[row + 1] - ai[row]; 1724 nzB = bi[row + 1] - bi[row]; 1725 ncols = 
nzA + nzB; 1726 cworkB = PetscSafePointerPlusOffset(bj, bi[row]); 1727 vworkA = PetscSafePointerPlusOffset(a_a, ai[row]); 1728 vworkB = PetscSafePointerPlusOffset(b_a, bi[row]); 1729 1730 /* load the column values for this row into vals*/ 1731 vals = PetscSafePointerPlusOffset(sbuf_aa_i, ct2); 1732 1733 lwrite = 0; 1734 for (l = 0; l < nzB; l++) { 1735 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1736 } 1737 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1738 for (l = 0; l < nzB; l++) { 1739 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1740 } 1741 1742 ct2 += ncols; 1743 } 1744 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1745 } 1746 1747 /* Assemble submat */ 1748 /* First assemble the local rows */ 1749 for (j = 0; j < nrow; j++) { 1750 row = irow[j]; 1751 proc = row2proc[j]; 1752 if (proc == rank) { 1753 Crow = row - rstart; /* local row index of C */ 1754 #if defined(PETSC_USE_CTABLE) 1755 row = rmap_loc[Crow]; /* row index of submat */ 1756 #else 1757 row = rmap[row]; 1758 #endif 1759 1760 if (allcolumns) { 1761 /* diagonal part A = c->A */ 1762 ncols = ai[Crow + 1] - ai[Crow]; 1763 cols = PetscSafePointerPlusOffset(aj, ai[Crow]); 1764 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1765 i = 0; 1766 for (k = 0; k < ncols; k++) { 1767 subcols[i] = cols[k] + cstart; 1768 subvals[i++] = vals[k]; 1769 } 1770 1771 /* off-diagonal part B = c->B */ 1772 ncols = bi[Crow + 1] - bi[Crow]; 1773 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1774 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1775 for (k = 0; k < ncols; k++) { 1776 subcols[i] = bmap[cols[k]]; 1777 subvals[i++] = vals[k]; 1778 } 1779 1780 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1781 1782 } else { /* !allcolumns */ 1783 #if defined(PETSC_USE_CTABLE) 1784 /* diagonal part A = c->A */ 1785 ncols = ai[Crow + 1] - ai[Crow]; 1786 cols = PetscSafePointerPlusOffset(aj, 
ai[Crow]); 1787 vals = PetscSafePointerPlusOffset(a_a, ai[Crow]); 1788 i = 0; 1789 for (k = 0; k < ncols; k++) { 1790 tcol = cmap_loc[cols[k]]; 1791 if (tcol) { 1792 subcols[i] = --tcol; 1793 subvals[i++] = vals[k]; 1794 } 1795 } 1796 1797 /* off-diagonal part B = c->B */ 1798 ncols = bi[Crow + 1] - bi[Crow]; 1799 cols = PetscSafePointerPlusOffset(bj, bi[Crow]); 1800 vals = PetscSafePointerPlusOffset(b_a, bi[Crow]); 1801 for (k = 0; k < ncols; k++) { 1802 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1803 if (tcol) { 1804 subcols[i] = --tcol; 1805 subvals[i++] = vals[k]; 1806 } 1807 } 1808 #else 1809 /* diagonal part A = c->A */ 1810 ncols = ai[Crow + 1] - ai[Crow]; 1811 cols = aj + ai[Crow]; 1812 vals = a_a + ai[Crow]; 1813 i = 0; 1814 for (k = 0; k < ncols; k++) { 1815 tcol = cmap[cols[k] + cstart]; 1816 if (tcol) { 1817 subcols[i] = --tcol; 1818 subvals[i++] = vals[k]; 1819 } 1820 } 1821 1822 /* off-diagonal part B = c->B */ 1823 ncols = bi[Crow + 1] - bi[Crow]; 1824 cols = bj + bi[Crow]; 1825 vals = b_a + bi[Crow]; 1826 for (k = 0; k < ncols; k++) { 1827 tcol = cmap[bmap[cols[k]]]; 1828 if (tcol) { 1829 subcols[i] = --tcol; 1830 subvals[i++] = vals[k]; 1831 } 1832 } 1833 #endif 1834 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1835 } 1836 } 1837 } 1838 1839 /* Now assemble the off-proc rows */ 1840 for (i = 0; i < nrqs; i++) { /* for each requested message */ 1841 /* recv values from other processes */ 1842 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1843 proc = pa[idex]; 1844 sbuf1_i = sbuf1[proc]; 1845 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1846 ct1 = 2 + 1; 1847 ct2 = 0; /* count of received C->j */ 1848 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1849 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1850 rbuf3_i = rbuf3[idex]; /* int** received C->j from 
other processes */ 1851 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1852 1853 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1854 max1 = sbuf1_i[2]; /* num of rows */ 1855 for (k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1856 row = sbuf1_i[ct1]; /* row index of submat */ 1857 if (!allcolumns) { 1858 idex = 0; 1859 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1860 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1861 for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1862 #if defined(PETSC_USE_CTABLE) 1863 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1864 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1865 } else { 1866 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1867 } 1868 #else 1869 tcol = cmap[rbuf3_i[ct2]]; 1870 #endif 1871 if (tcol) { 1872 subcols[idex] = --tcol; /* may not be sorted */ 1873 subvals[idex++] = rbuf4_i[ct2]; 1874 1875 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
1876 For reuse, we replace received C->j with index that should be inserted to submat */ 1877 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1878 } 1879 } 1880 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1881 } else { /* scall == MAT_REUSE_MATRIX */ 1882 submat = submats[0]; 1883 subc = (Mat_SeqAIJ *)submat->data; 1884 1885 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1886 for (l = 0; l < nnz; l++) { 1887 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1888 subvals[idex++] = rbuf4_i[ct2]; 1889 } 1890 1891 bj = subc->j + subc->i[row]; /* sorted column indices */ 1892 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1893 } 1894 } else { /* allcolumns */ 1895 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1896 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, PetscSafePointerPlusOffset(rbuf3_i, ct2), PetscSafePointerPlusOffset(rbuf4_i, ct2), INSERT_VALUES)); 1897 ct2 += nnz; 1898 } 1899 } 1900 } 1901 1902 /* sending a->a are done */ 1903 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1904 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1905 1906 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1907 PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY)); 1908 submats[0] = submat; 1909 1910 /* Restore the indices */ 1911 PetscCall(ISRestoreIndices(isrow[0], &irow)); 1912 if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol)); 1913 1914 /* Destroy allocated memory */ 1915 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 1916 PetscCall(PetscFree3(rbuf4, subcols, subvals)); 1917 if (sbuf_aa) { 1918 PetscCall(PetscFree(sbuf_aa[0])); 1919 PetscCall(PetscFree(sbuf_aa)); 1920 } 1921 1922 if (scall == MAT_INITIAL_MATRIX) { 1923 PetscCall(PetscFree(lens)); 1924 if (sbuf_aj) { 1925 PetscCall(PetscFree(sbuf_aj[0])); 1926 PetscCall(PetscFree(sbuf_aj)); 1927 } 1928 } 1929 
PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 1930 PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a)); 1931 PetscFunctionReturn(PETSC_SUCCESS); 1932 } 1933 1934 static PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1935 { 1936 PetscInt ncol; 1937 PetscBool colflag, allcolumns = PETSC_FALSE; 1938 1939 PetscFunctionBegin; 1940 /* Allocate memory to hold all the submatrices */ 1941 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat)); 1942 1943 /* Check for special case: each processor gets entire matrix columns */ 1944 PetscCall(ISIdentity(iscol[0], &colflag)); 1945 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1946 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1947 1948 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat)); 1949 PetscFunctionReturn(PETSC_SUCCESS); 1950 } 1951 1952 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1953 { 1954 PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2]; 1955 PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE; 1956 Mat_SeqAIJ *subc; 1957 Mat_SubSppt *smat; 1958 1959 PetscFunctionBegin; 1960 /* Check for special case: each processor has a single IS */ 1961 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1962 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 1963 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1964 PetscFunctionReturn(PETSC_SUCCESS); 1965 } 1966 1967 /* Collect global wantallmatrix and nstages */ 1968 if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt); 1969 else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt)); 1970 if (!nmax) nmax = 1; 1971 1972 if (scall 
== MAT_INITIAL_MATRIX) { 1973 /* Collect global wantallmatrix and nstages */ 1974 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1975 PetscCall(ISIdentity(*isrow, &rowflag)); 1976 PetscCall(ISIdentity(*iscol, &colflag)); 1977 PetscCall(ISGetLocalSize(*isrow, &nrow)); 1978 PetscCall(ISGetLocalSize(*iscol, &ncol)); 1979 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1980 wantallmatrix = PETSC_TRUE; 1981 1982 PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL)); 1983 } 1984 } 1985 1986 /* Determine the number of stages through which submatrices are done 1987 Each stage will extract nmax submatrices. 1988 nmax is determined by the matrix column dimension. 1989 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1990 */ 1991 nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 1992 1993 in[0] = -1 * (PetscInt)wantallmatrix; 1994 in[1] = nstages; 1995 PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C))); 1996 wantallmatrix = (PetscBool)(-out[0]); 1997 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 1998 1999 } else { /* MAT_REUSE_MATRIX */ 2000 if (ismax) { 2001 subc = (Mat_SeqAIJ *)(*submat)[0]->data; 2002 smat = subc->submatis1; 2003 } else { /* (*submat)[0] is a dummy matrix */ 2004 smat = (Mat_SubSppt *)(*submat)[0]->data; 2005 } 2006 if (!smat) { 2007 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) 
*/ 2008 wantallmatrix = PETSC_TRUE; 2009 } else if (smat->singleis) { 2010 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 2011 PetscFunctionReturn(PETSC_SUCCESS); 2012 } else { 2013 nstages = smat->nstages; 2014 } 2015 } 2016 2017 if (wantallmatrix) { 2018 PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat)); 2019 PetscFunctionReturn(PETSC_SUCCESS); 2020 } 2021 2022 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2023 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat)); 2024 2025 for (i = 0, pos = 0; i < nstages; i++) { 2026 if (pos + nmax <= ismax) max_no = nmax; 2027 else if (pos >= ismax) max_no = 0; 2028 else max_no = ismax - pos; 2029 2030 PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, PetscSafePointerPlusOffset(isrow, pos), PetscSafePointerPlusOffset(iscol, pos), scall, *submat + pos)); 2031 if (!max_no) { 2032 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2033 smat = (Mat_SubSppt *)(*submat)[pos]->data; 2034 smat->nstages = nstages; 2035 } 2036 pos++; /* advance to next dummy matrix if any */ 2037 } else pos += max_no; 2038 } 2039 2040 if (ismax && scall == MAT_INITIAL_MATRIX) { 2041 /* save nstages for reuse */ 2042 subc = (Mat_SeqAIJ *)(*submat)[0]->data; 2043 smat = subc->submatis1; 2044 smat->nstages = nstages; 2045 } 2046 PetscFunctionReturn(PETSC_SUCCESS); 2047 } 2048 2049 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) 2050 { 2051 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 2052 Mat A = c->A; 2053 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc; 2054 const PetscInt **icol, **irow; 2055 PetscInt *nrow, *ncol, start; 2056 PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr; 2057 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1; 2058 PetscInt nrqs = 0, 
msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol; 2059 PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2060 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2061 #if defined(PETSC_USE_CTABLE) 2062 PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i; 2063 #else 2064 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2065 #endif 2066 const PetscInt *irow_i; 2067 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2068 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2069 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2070 MPI_Comm comm; 2071 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2072 PetscMPIInt *onodes1, *olengths1, end; 2073 PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2074 Mat_SubSppt *smat_i; 2075 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2076 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen; 2077 2078 PetscFunctionBegin; 2079 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2080 size = c->size; 2081 rank = c->rank; 2082 2083 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2084 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2085 2086 for (i = 0; i < ismax; i++) { 2087 PetscCall(ISSorted(iscol[i], &issorted[i])); 2088 if (!issorted[i]) iscsorted = issorted[i]; 2089 2090 PetscCall(ISSorted(isrow[i], &issorted[i])); 2091 2092 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2093 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2094 2095 /* Check for special case: allcolumn */ 2096 PetscCall(ISIdentity(iscol[i], &colflag)); 2097 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2098 if (colflag && ncol[i] == C->cmap->N) { 2099 allcolumns[i] = PETSC_TRUE; 2100 icol[i] = NULL; 2101 } else { 2102 allcolumns[i] = PETSC_FALSE; 2103 
PetscCall(ISGetIndices(iscol[i], &icol[i])); 2104 } 2105 } 2106 2107 if (scall == MAT_REUSE_MATRIX) { 2108 /* Assumes new rows are same length as the old rows */ 2109 for (i = 0; i < ismax; i++) { 2110 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2111 subc = (Mat_SeqAIJ *)submats[i]->data; 2112 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size"); 2113 2114 /* Initial matrix as if empty */ 2115 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2116 2117 smat_i = subc->submatis1; 2118 2119 nrqs = smat_i->nrqs; 2120 nrqr = smat_i->nrqr; 2121 rbuf1 = smat_i->rbuf1; 2122 rbuf2 = smat_i->rbuf2; 2123 rbuf3 = smat_i->rbuf3; 2124 req_source2 = smat_i->req_source2; 2125 2126 sbuf1 = smat_i->sbuf1; 2127 sbuf2 = smat_i->sbuf2; 2128 ptr = smat_i->ptr; 2129 tmp = smat_i->tmp; 2130 ctr = smat_i->ctr; 2131 2132 pa = smat_i->pa; 2133 req_size = smat_i->req_size; 2134 req_source1 = smat_i->req_source1; 2135 2136 allcolumns[i] = smat_i->allcolumns; 2137 row2proc[i] = smat_i->row2proc; 2138 rmap[i] = smat_i->rmap; 2139 cmap[i] = smat_i->cmap; 2140 } 2141 2142 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2143 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2144 smat_i = (Mat_SubSppt *)submats[0]->data; 2145 2146 nrqs = smat_i->nrqs; 2147 nrqr = smat_i->nrqr; 2148 rbuf1 = smat_i->rbuf1; 2149 rbuf2 = smat_i->rbuf2; 2150 rbuf3 = smat_i->rbuf3; 2151 req_source2 = smat_i->req_source2; 2152 2153 sbuf1 = smat_i->sbuf1; 2154 sbuf2 = smat_i->sbuf2; 2155 ptr = smat_i->ptr; 2156 tmp = smat_i->tmp; 2157 ctr = smat_i->ctr; 2158 2159 pa = smat_i->pa; 2160 req_size = smat_i->req_size; 2161 req_source1 = smat_i->req_source1; 2162 2163 allcolumns[0] = PETSC_FALSE; 2164 } 2165 } else { /* scall == MAT_INITIAL_MATRIX */ 2166 /* Get some new tags to 
keep the communication clean */ 2167 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2168 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2169 2170 /* evaluate communication - mesg to who, length of mesg, and buffer space 2171 required. Based on this, buffers are allocated, and data copied into them*/ 2172 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2173 2174 for (i = 0; i < ismax; i++) { 2175 jmax = nrow[i]; 2176 irow_i = irow[i]; 2177 2178 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2179 row2proc[i] = row2proc_i; 2180 2181 if (issorted[i]) proc = 0; 2182 for (j = 0; j < jmax; j++) { 2183 if (!issorted[i]) proc = 0; 2184 row = irow_i[j]; 2185 while (row >= C->rmap->range[proc + 1]) proc++; 2186 w4[proc]++; 2187 row2proc_i[j] = proc; /* map row index to proc */ 2188 } 2189 for (j = 0; j < size; j++) { 2190 if (w4[j]) { 2191 w1[j] += w4[j]; 2192 w3[j]++; 2193 w4[j] = 0; 2194 } 2195 } 2196 } 2197 2198 nrqs = 0; /* no of outgoing messages */ 2199 msz = 0; /* total mesg length (for all procs) */ 2200 w1[rank] = 0; /* no mesg sent to self */ 2201 w3[rank] = 0; 2202 for (i = 0; i < size; i++) { 2203 if (w1[i]) { 2204 w2[i] = 1; 2205 nrqs++; 2206 } /* there exists a message to proc i */ 2207 } 2208 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2209 for (i = 0, j = 0; i < size; i++) { 2210 if (w1[i]) { 2211 pa[j] = i; 2212 j++; 2213 } 2214 } 2215 2216 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2217 for (i = 0; i < nrqs; i++) { 2218 j = pa[i]; 2219 w1[j] += w2[j] + 2 * w3[j]; 2220 msz += w1[j]; 2221 } 2222 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2223 2224 /* Determine the number of messages to expect, their lengths, from from-ids */ 2225 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 2226 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, 
&olengths1)); 2227 2228 /* Now post the Irecvs corresponding to these messages */ 2229 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2230 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2231 2232 /* Allocate Memory for outgoing messages */ 2233 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2234 PetscCall(PetscArrayzero(sbuf1, size)); 2235 PetscCall(PetscArrayzero(ptr, size)); 2236 2237 { 2238 PetscInt *iptr = tmp; 2239 k = 0; 2240 for (i = 0; i < nrqs; i++) { 2241 j = pa[i]; 2242 iptr += k; 2243 sbuf1[j] = iptr; 2244 k = w1[j]; 2245 } 2246 } 2247 2248 /* Form the outgoing messages. Initialize the header space */ 2249 for (i = 0; i < nrqs; i++) { 2250 j = pa[i]; 2251 sbuf1[j][0] = 0; 2252 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2253 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2254 } 2255 2256 /* Parse the isrow and copy data into outbuf */ 2257 for (i = 0; i < ismax; i++) { 2258 row2proc_i = row2proc[i]; 2259 PetscCall(PetscArrayzero(ctr, size)); 2260 irow_i = irow[i]; 2261 jmax = nrow[i]; 2262 for (j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2263 proc = row2proc_i[j]; 2264 if (proc != rank) { /* copy to the outgoing buf*/ 2265 ctr[proc]++; 2266 *ptr[proc] = irow_i[j]; 2267 ptr[proc]++; 2268 } 2269 } 2270 /* Update the headers for the current IS */ 2271 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 2272 if ((ctr_j = ctr[j])) { 2273 sbuf1_j = sbuf1[j]; 2274 k = ++sbuf1_j[0]; 2275 sbuf1_j[2 * k] = ctr_j; 2276 sbuf1_j[2 * k - 1] = i; 2277 } 2278 } 2279 } 2280 2281 /* Now post the sends */ 2282 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2283 for (i = 0; i < nrqs; ++i) { 2284 j = pa[i]; 2285 PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2286 } 2287 2288 /* Post Receives to capture the buffer size */ 2289 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2290 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 
2291 if (nrqs) rbuf2[0] = tmp + msz; 2292 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2293 for (i = 0; i < nrqs; ++i) { 2294 j = pa[i]; 2295 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2296 } 2297 2298 /* Send to other procs the buf size they should allocate */ 2299 /* Receive messages*/ 2300 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2301 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2302 { 2303 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2304 PetscInt *sbuf2_i; 2305 2306 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2307 for (i = 0; i < nrqr; ++i) { 2308 req_size[i] = 0; 2309 rbuf1_i = rbuf1[i]; 2310 start = 2 * rbuf1_i[0] + 1; 2311 end = olengths1[i]; 2312 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2313 sbuf2_i = sbuf2[i]; 2314 for (j = start; j < end; j++) { 2315 id = rbuf1_i[j] - rstart; 2316 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2317 sbuf2_i[j] = ncols; 2318 req_size[i] += ncols; 2319 } 2320 req_source1[i] = onodes1[i]; 2321 /* form the header */ 2322 sbuf2_i[0] = req_size[i]; 2323 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2324 2325 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2326 } 2327 } 2328 2329 PetscCall(PetscFree(onodes1)); 2330 PetscCall(PetscFree(olengths1)); 2331 PetscCall(PetscFree(r_waits1)); 2332 PetscCall(PetscFree4(w1, w2, w3, w4)); 2333 2334 /* Receive messages*/ 2335 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2336 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2337 for (i = 0; i < nrqs; ++i) { 2338 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2339 req_source2[i] = pa[i]; 2340 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2341 } 2342 PetscCall(PetscFree(r_waits2)); 2343 2344 /* Wait on sends1 and sends2 */ 2345 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 
2346 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2347 PetscCall(PetscFree(s_waits1)); 2348 PetscCall(PetscFree(s_waits2)); 2349 2350 /* Now allocate sending buffers for a->j, and send them off */ 2351 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 2352 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2353 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 2354 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2355 2356 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2357 { 2358 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2359 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2360 PetscInt cend = C->cmap->rend; 2361 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2362 2363 for (i = 0; i < nrqr; i++) { 2364 rbuf1_i = rbuf1[i]; 2365 sbuf_aj_i = sbuf_aj[i]; 2366 ct1 = 2 * rbuf1_i[0] + 1; 2367 ct2 = 0; 2368 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2369 kmax = rbuf1[i][2 * j]; 2370 for (k = 0; k < kmax; k++, ct1++) { 2371 row = rbuf1_i[ct1] - rstart; 2372 nzA = a_i[row + 1] - a_i[row]; 2373 nzB = b_i[row + 1] - b_i[row]; 2374 ncols = nzA + nzB; 2375 cworkA = PetscSafePointerPlusOffset(a_j, a_i[row]); 2376 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2377 2378 /* load the column indices for this row into cols */ 2379 cols = sbuf_aj_i + ct2; 2380 2381 lwrite = 0; 2382 for (l = 0; l < nzB; l++) { 2383 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2384 } 2385 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2386 for (l = 0; l < nzB; l++) { 2387 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2388 } 2389 2390 ct2 += ncols; 2391 } 2392 } 2393 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2394 } 2395 } 2396 2397 /* create col map: global col of C -> local col of submatrices */ 2398 { 2399 const PetscInt *icol_i; 2400 #if defined(PETSC_USE_CTABLE) 2401 for (i = 0; i < ismax; i++) { 
2402 if (!allcolumns[i]) { 2403 PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i)); 2404 2405 jmax = ncol[i]; 2406 icol_i = icol[i]; 2407 cmap_i = cmap[i]; 2408 for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1)); 2409 } else cmap[i] = NULL; 2410 } 2411 #else 2412 for (i = 0; i < ismax; i++) { 2413 if (!allcolumns[i]) { 2414 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2415 jmax = ncol[i]; 2416 icol_i = icol[i]; 2417 cmap_i = cmap[i]; 2418 for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2419 } else cmap[i] = NULL; 2420 } 2421 #endif 2422 } 2423 2424 /* Create lens which is required for MatCreate... */ 2425 for (i = 0, j = 0; i < ismax; i++) j += nrow[i]; 2426 PetscCall(PetscMalloc1(ismax, &lens)); 2427 2428 if (ismax) PetscCall(PetscCalloc1(j, &lens[0])); 2429 for (i = 1; i < ismax; i++) lens[i] = PetscSafePointerPlusOffset(lens[i - 1], nrow[i - 1]); 2430 2431 /* Update lens from local data */ 2432 for (i = 0; i < ismax; i++) { 2433 row2proc_i = row2proc[i]; 2434 jmax = nrow[i]; 2435 if (!allcolumns[i]) cmap_i = cmap[i]; 2436 irow_i = irow[i]; 2437 lens_i = lens[i]; 2438 for (j = 0; j < jmax; j++) { 2439 row = irow_i[j]; 2440 proc = row2proc_i[j]; 2441 if (proc == rank) { 2442 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2443 if (!allcolumns[i]) { 2444 for (k = 0; k < ncols; k++) { 2445 #if defined(PETSC_USE_CTABLE) 2446 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2447 #else 2448 tcol = cmap_i[cols[k]]; 2449 #endif 2450 if (tcol) lens_i[j]++; 2451 } 2452 } else { /* allcolumns */ 2453 lens_i[j] = ncols; 2454 } 2455 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2456 } 2457 } 2458 } 2459 2460 /* Create row map: global row of C -> local row of submatrices */ 2461 #if defined(PETSC_USE_CTABLE) 2462 for (i = 0; i < ismax; i++) { 2463 PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i)); 2464 irow_i = irow[i]; 2465 jmax = nrow[i]; 2466 for (j = 0; j < jmax; j++) 
PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1)); 2467 } 2468 #else 2469 for (i = 0; i < ismax; i++) { 2470 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2471 rmap_i = rmap[i]; 2472 irow_i = irow[i]; 2473 jmax = nrow[i]; 2474 for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2475 } 2476 #endif 2477 2478 /* Update lens from offproc data */ 2479 { 2480 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2481 2482 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2483 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2484 sbuf1_i = sbuf1[pa[tmp2]]; 2485 jmax = sbuf1_i[0]; 2486 ct1 = 2 * jmax + 1; 2487 ct2 = 0; 2488 rbuf2_i = rbuf2[tmp2]; 2489 rbuf3_i = rbuf3[tmp2]; 2490 for (j = 1; j <= jmax; j++) { 2491 is_no = sbuf1_i[2 * j - 1]; 2492 max1 = sbuf1_i[2 * j]; 2493 lens_i = lens[is_no]; 2494 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2495 rmap_i = rmap[is_no]; 2496 for (k = 0; k < max1; k++, ct1++) { 2497 #if defined(PETSC_USE_CTABLE) 2498 PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row)); 2499 row--; 2500 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2501 #else 2502 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2503 #endif 2504 max2 = rbuf2_i[ct1]; 2505 for (l = 0; l < max2; l++, ct2++) { 2506 if (!allcolumns[is_no]) { 2507 #if defined(PETSC_USE_CTABLE) 2508 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2509 #else 2510 tcol = cmap_i[rbuf3_i[ct2]]; 2511 #endif 2512 if (tcol) lens_i[row]++; 2513 } else { /* allcolumns */ 2514 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2515 } 2516 } 2517 } 2518 } 2519 } 2520 } 2521 PetscCall(PetscFree(r_waits3)); 2522 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2523 PetscCall(PetscFree(s_waits3)); 2524 2525 /* Create the submatrices */ 2526 for (i = 0; i < ismax; i++) { 2527 PetscInt rbs, cbs; 2528 2529 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2530 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2531 2532 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2533 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2534 2535 if (rbs > 1 || cbs > 1) PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2536 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2537 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2538 2539 /* create struct Mat_SubSppt and attached it to submat */ 2540 PetscCall(PetscNew(&smat_i)); 2541 subc = (Mat_SeqAIJ *)submats[i]->data; 2542 subc->submatis1 = smat_i; 2543 2544 smat_i->destroy = submats[i]->ops->destroy; 2545 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2546 submats[i]->factortype = C->factortype; 2547 2548 smat_i->id = i; 2549 smat_i->nrqs = nrqs; 2550 smat_i->nrqr = nrqr; 2551 smat_i->rbuf1 = rbuf1; 2552 smat_i->rbuf2 = rbuf2; 2553 smat_i->rbuf3 = rbuf3; 2554 smat_i->sbuf2 = sbuf2; 2555 smat_i->req_source2 = req_source2; 2556 2557 smat_i->sbuf1 = sbuf1; 2558 smat_i->ptr = ptr; 2559 smat_i->tmp = tmp; 2560 smat_i->ctr = ctr; 2561 2562 smat_i->pa = pa; 2563 smat_i->req_size = req_size; 2564 smat_i->req_source1 = req_source1; 2565 2566 smat_i->allcolumns = allcolumns[i]; 2567 smat_i->singleis = PETSC_FALSE; 2568 smat_i->row2proc = row2proc[i]; 2569 smat_i->rmap = rmap[i]; 2570 smat_i->cmap = cmap[i]; 2571 } 2572 2573 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2574 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2575 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2576 PetscCall(MatSetType(submats[0], MATDUMMY)); 2577 2578 /* 
create struct Mat_SubSppt and attached it to submat */ 2579 PetscCall(PetscNew(&smat_i)); 2580 submats[0]->data = (void *)smat_i; 2581 2582 smat_i->destroy = submats[0]->ops->destroy; 2583 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2584 submats[0]->factortype = C->factortype; 2585 2586 smat_i->id = 0; 2587 smat_i->nrqs = nrqs; 2588 smat_i->nrqr = nrqr; 2589 smat_i->rbuf1 = rbuf1; 2590 smat_i->rbuf2 = rbuf2; 2591 smat_i->rbuf3 = rbuf3; 2592 smat_i->sbuf2 = sbuf2; 2593 smat_i->req_source2 = req_source2; 2594 2595 smat_i->sbuf1 = sbuf1; 2596 smat_i->ptr = ptr; 2597 smat_i->tmp = tmp; 2598 smat_i->ctr = ctr; 2599 2600 smat_i->pa = pa; 2601 smat_i->req_size = req_size; 2602 smat_i->req_source1 = req_source1; 2603 2604 smat_i->allcolumns = PETSC_FALSE; 2605 smat_i->singleis = PETSC_FALSE; 2606 smat_i->row2proc = NULL; 2607 smat_i->rmap = NULL; 2608 smat_i->cmap = NULL; 2609 } 2610 2611 if (ismax) PetscCall(PetscFree(lens[0])); 2612 PetscCall(PetscFree(lens)); 2613 if (sbuf_aj) { 2614 PetscCall(PetscFree(sbuf_aj[0])); 2615 PetscCall(PetscFree(sbuf_aj)); 2616 } 2617 2618 } /* endof scall == MAT_INITIAL_MATRIX */ 2619 2620 /* Post recv matrix values */ 2621 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2622 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2623 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2624 for (i = 0; i < nrqs; ++i) { 2625 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2626 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2627 } 2628 2629 /* Allocate sending buffers for a->a, and send them off */ 2630 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2631 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2632 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 2633 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2634 2635 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2636 { 2637 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite; 2638 PetscInt cstart = 
C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2639 PetscInt cend = C->cmap->rend; 2640 PetscInt *b_j = b->j; 2641 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2642 2643 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2644 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2645 for (i = 0; i < nrqr; i++) { 2646 rbuf1_i = rbuf1[i]; 2647 sbuf_aa_i = sbuf_aa[i]; 2648 ct1 = 2 * rbuf1_i[0] + 1; 2649 ct2 = 0; 2650 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2651 kmax = rbuf1_i[2 * j]; 2652 for (k = 0; k < kmax; k++, ct1++) { 2653 row = rbuf1_i[ct1] - rstart; 2654 nzA = a_i[row + 1] - a_i[row]; 2655 nzB = b_i[row + 1] - b_i[row]; 2656 ncols = nzA + nzB; 2657 cworkB = PetscSafePointerPlusOffset(b_j, b_i[row]); 2658 vworkA = PetscSafePointerPlusOffset(a_a, a_i[row]); 2659 vworkB = PetscSafePointerPlusOffset(b_a, b_i[row]); 2660 2661 /* load the column values for this row into vals*/ 2662 vals = sbuf_aa_i + ct2; 2663 2664 lwrite = 0; 2665 for (l = 0; l < nzB; l++) { 2666 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2667 } 2668 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2669 for (l = 0; l < nzB; l++) { 2670 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2671 } 2672 2673 ct2 += ncols; 2674 } 2675 } 2676 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2677 } 2678 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2679 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2680 } 2681 2682 /* Assemble the matrices */ 2683 /* First assemble the local rows */ 2684 for (i = 0; i < ismax; i++) { 2685 row2proc_i = row2proc[i]; 2686 subc = (Mat_SeqAIJ *)submats[i]->data; 2687 imat_ilen = subc->ilen; 2688 imat_j = subc->j; 2689 imat_i = subc->i; 2690 imat_a = subc->a; 2691 2692 if (!allcolumns[i]) cmap_i = cmap[i]; 2693 rmap_i = rmap[i]; 2694 irow_i = irow[i]; 2695 jmax = nrow[i]; 2696 for (j = 0; j < jmax; 
j++) { 2697 row = irow_i[j]; 2698 proc = row2proc_i[j]; 2699 if (proc == rank) { 2700 old_row = row; 2701 #if defined(PETSC_USE_CTABLE) 2702 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2703 row--; 2704 #else 2705 row = rmap_i[row]; 2706 #endif 2707 ilen_row = imat_ilen[row]; 2708 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2709 mat_i = imat_i[row]; 2710 mat_a = imat_a + mat_i; 2711 mat_j = imat_j + mat_i; 2712 if (!allcolumns[i]) { 2713 for (k = 0; k < ncols; k++) { 2714 #if defined(PETSC_USE_CTABLE) 2715 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2716 #else 2717 tcol = cmap_i[cols[k]]; 2718 #endif 2719 if (tcol) { 2720 *mat_j++ = tcol - 1; 2721 *mat_a++ = vals[k]; 2722 ilen_row++; 2723 } 2724 } 2725 } else { /* allcolumns */ 2726 for (k = 0; k < ncols; k++) { 2727 *mat_j++ = cols[k]; /* global col index! */ 2728 *mat_a++ = vals[k]; 2729 ilen_row++; 2730 } 2731 } 2732 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2733 2734 imat_ilen[row] = ilen_row; 2735 } 2736 } 2737 } 2738 2739 /* Now assemble the off proc rows */ 2740 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2741 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2742 sbuf1_i = sbuf1[pa[tmp2]]; 2743 jmax = sbuf1_i[0]; 2744 ct1 = 2 * jmax + 1; 2745 ct2 = 0; 2746 rbuf2_i = rbuf2[tmp2]; 2747 rbuf3_i = rbuf3[tmp2]; 2748 rbuf4_i = rbuf4[tmp2]; 2749 for (j = 1; j <= jmax; j++) { 2750 is_no = sbuf1_i[2 * j - 1]; 2751 rmap_i = rmap[is_no]; 2752 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2753 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2754 imat_ilen = subc->ilen; 2755 imat_j = subc->j; 2756 imat_i = subc->i; 2757 imat_a = subc->a; 2758 max1 = sbuf1_i[2 * j]; 2759 for (k = 0; k < max1; k++, ct1++) { 2760 row = sbuf1_i[ct1]; 2761 #if defined(PETSC_USE_CTABLE) 2762 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2763 row--; 2764 #else 2765 row = rmap_i[row]; 2766 #endif 2767 ilen = imat_ilen[row]; 2768 mat_i = 
imat_i[row]; 2769 mat_a = PetscSafePointerPlusOffset(imat_a, mat_i); 2770 mat_j = PetscSafePointerPlusOffset(imat_j, mat_i); 2771 max2 = rbuf2_i[ct1]; 2772 if (!allcolumns[is_no]) { 2773 for (l = 0; l < max2; l++, ct2++) { 2774 #if defined(PETSC_USE_CTABLE) 2775 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2776 #else 2777 tcol = cmap_i[rbuf3_i[ct2]]; 2778 #endif 2779 if (tcol) { 2780 *mat_j++ = tcol - 1; 2781 *mat_a++ = rbuf4_i[ct2]; 2782 ilen++; 2783 } 2784 } 2785 } else { /* allcolumns */ 2786 for (l = 0; l < max2; l++, ct2++) { 2787 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2788 *mat_a++ = rbuf4_i[ct2]; 2789 ilen++; 2790 } 2791 } 2792 imat_ilen[row] = ilen; 2793 } 2794 } 2795 } 2796 2797 if (!iscsorted) { /* sort column indices of the rows */ 2798 for (i = 0; i < ismax; i++) { 2799 subc = (Mat_SeqAIJ *)submats[i]->data; 2800 imat_j = subc->j; 2801 imat_i = subc->i; 2802 imat_a = subc->a; 2803 imat_ilen = subc->ilen; 2804 2805 if (allcolumns[i]) continue; 2806 jmax = nrow[i]; 2807 for (j = 0; j < jmax; j++) { 2808 mat_i = imat_i[j]; 2809 mat_a = imat_a + mat_i; 2810 mat_j = imat_j + mat_i; 2811 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2812 } 2813 } 2814 } 2815 2816 PetscCall(PetscFree(r_waits4)); 2817 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2818 PetscCall(PetscFree(s_waits4)); 2819 2820 /* Restore the indices */ 2821 for (i = 0; i < ismax; i++) { 2822 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2823 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2824 } 2825 2826 for (i = 0; i < ismax; i++) { 2827 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2828 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2829 } 2830 2831 /* Destroy allocated memory */ 2832 if (sbuf_aa) { 2833 PetscCall(PetscFree(sbuf_aa[0])); 2834 PetscCall(PetscFree(sbuf_aa)); 2835 } 2836 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, 
ncol, issorted)); 2837 2838 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2839 PetscCall(PetscFree(rbuf4)); 2840 2841 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2842 PetscFunctionReturn(PETSC_SUCCESS); 2843 } 2844 2845 /* 2846 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2847 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2848 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2849 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2850 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2851 state, and needs to be "assembled" later by compressing B's column space. 2852 2853 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2854 Following this call, C->A & C->B have been created, even if empty. 2855 */ 2856 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2857 { 2858 /* If making this function public, change the error returned in this function away from _PLIB. */ 2859 Mat_MPIAIJ *aij; 2860 Mat_SeqAIJ *Baij; 2861 PetscBool seqaij, Bdisassembled; 2862 PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count; 2863 PetscScalar v; 2864 const PetscInt *rowindices, *colindices; 2865 2866 PetscFunctionBegin; 2867 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
*/ 2868 if (A) { 2869 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2870 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2871 if (rowemb) { 2872 PetscCall(ISGetLocalSize(rowemb, &m)); 2873 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2874 } else { 2875 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2876 } 2877 if (dcolemb) { 2878 PetscCall(ISGetLocalSize(dcolemb, &n)); 2879 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2880 } else { 2881 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2882 } 2883 } 2884 if (B) { 2885 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2886 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2887 if (rowemb) { 2888 PetscCall(ISGetLocalSize(rowemb, &m)); 2889 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2890 } else { 2891 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2892 } 2893 if (ocolemb) { 2894 PetscCall(ISGetLocalSize(ocolemb, &n)); 2895 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2896 } else { 2897 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with 
the MPIAIJ matrix"); 2898 } 2899 } 2900 2901 aij = (Mat_MPIAIJ *)C->data; 2902 if (!aij->A) { 2903 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2904 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2905 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2906 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2907 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2908 } 2909 if (A) { 2910 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2911 } else { 2912 PetscCall(MatSetUp(aij->A)); 2913 } 2914 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2915 /* 2916 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2917 need to "disassemble" B -- convert it to using C's global indices. 2918 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2919 2920 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2921 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2922 2923 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2924 At least avoid calling MatSetValues() and the implied searches? 2925 */ 2926 2927 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2928 #if defined(PETSC_USE_CTABLE) 2929 PetscCall(PetscHMapIDestroy(&aij->colmap)); 2930 #else 2931 PetscCall(PetscFree(aij->colmap)); 2932 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. 
*/ 2933 #endif 2934 ngcol = 0; 2935 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2936 if (aij->garray) PetscCall(PetscFree(aij->garray)); 2937 PetscCall(VecDestroy(&aij->lvec)); 2938 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2939 } 2940 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2941 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2942 } 2943 Bdisassembled = PETSC_FALSE; 2944 if (!aij->B) { 2945 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2946 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2947 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2948 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2949 Bdisassembled = PETSC_TRUE; 2950 } 2951 if (B) { 2952 Baij = (Mat_SeqAIJ *)B->data; 2953 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2954 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2955 for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2956 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2957 PetscCall(PetscFree(nz)); 2958 } 2959 2960 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2961 shift = rend - rstart; 2962 count = 0; 2963 rowindices = NULL; 2964 colindices = NULL; 2965 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2966 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2967 for (i = 0; i < B->rmap->n; i++) { 2968 PetscInt row; 2969 row = i; 2970 if (rowindices) row = rowindices[i]; 2971 for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2972 col = Baij->j[count]; 2973 if (colindices) col = colindices[col]; 2974 if (Bdisassembled && col >= rstart) col += shift; 2975 v = Baij->a[count]; 2976 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2977 ++count; 2978 } 2979 } 2980 /* No assembly for aij->B is necessary. */ 2981 /* FIXME: set aij->B's nonzerostate correctly. 
*/ 2982 } else { 2983 PetscCall(MatSetUp(aij->B)); 2984 } 2985 C->preallocated = PETSC_TRUE; 2986 C->was_assembled = PETSC_FALSE; 2987 C->assembled = PETSC_FALSE; 2988 /* 2989 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2990 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2991 */ 2992 PetscFunctionReturn(PETSC_SUCCESS); 2993 } 2994 2995 /* 2996 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2997 */ 2998 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 2999 { 3000 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 3001 3002 PetscFunctionBegin; 3003 PetscAssertPointer(A, 2); 3004 PetscAssertPointer(B, 3); 3005 /* FIXME: make sure C is assembled */ 3006 *A = aij->A; 3007 *B = aij->B; 3008 /* Note that we don't incref *A and *B, so be careful! */ 3009 PetscFunctionReturn(PETSC_SUCCESS); 3010 } 3011 3012 /* 3013 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3014 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
 */
/* Generic driver: extracts the diag (A) and off-diag (B) sequential pieces via getsubmats_seq()
   and assembles them into parallel submatrices via setseqmats()/setseqmat(). Collective on C's comm.
   NOTE(review): the getlocalmats callback is accepted but not used in this body -- presumably reserved
   for a future MAT_REUSE_MATRIX path; confirm before removing. */
static PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    i, ii, cismax, ispar;
  Mat        *A, *B;
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(PETSC_SUCCESS);

  /* cismax = number of this rank's index sets that live on a parallel (size > 1) subcomm */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  /* ii indexes the compacted arrays (parallel ISs only); i indexes all ISs */
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &ciscol[ii]));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        j, issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    /* after ISSort() duplicates are adjacent, so a pairwise scan suffices */
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* alias (no incref) the row IS for the off-diag extraction */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        /* a nonzero permutation means A[i]'s entries must be reordered into (*submat)[i] */
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MPIAIJ instantiation of the generic driver above: binds the SeqAIJ/MPIAIJ-specific callbacks. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}