/*
  Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that are shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);

/*
  Takes a general IS and builds a block version of the IS that contains the given IS plus any values needed to fill out the blocks

  The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance, but
  that is a very major recoding job.

  Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
*/
static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
{
  PetscFunctionBegin;
  for (PetscInt i = 0; i < imax; i++) {
    if (!is[i]) break;
    PetscInt n = 0, N, Nmax, Nmin;
    const PetscInt *idx;
    PetscInt *nidx = NULL;
    MPI_Comm comm;
    PetscBT bt;

    PetscCall(ISGetLocalSize(is[i], &N));
    if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
      PetscCall(ISGetIndices(is[i], &idx));
      PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
      Nmin = Nmin / bs;
      Nmax = Nmax / bs;
      PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
      }
      PetscCall(PetscMalloc1(n, &nidx));
      n = 0;
      PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
      }
      PetscCall(PetscBTDestroy(&bt));
      PetscCall(ISRestoreIndices(is[i], &idx));
    }
    PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
    PetscCall(ISDestroy(is + i));
    PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
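
/*
  Illustrative example (not executed anywhere in this file): with bs = 2 and an input IS containing
  the global indices {1, 4, 5}, the touched blocks are {0, 2} (since 1/2 = 0 and 4/2 = 5/2 = 2), so
  the routine replaces the IS with the block IS {0, 2}, which expands back to the rows {0, 1, 4, 5}.
*/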

PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm comm;
  PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout rmap;
  PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF sf;
  PetscSFNode *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should go */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put it all together so that we can optimize communication */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows] = owner; /* processor */
        rrow_isids[nrrows] = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test whether we need to exchange messages; generally speaking, we do not need to exchange data when the overlap is 1 */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages; this usually corresponds to overlap 1 */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++] = i; /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */
  }
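  /* At this point tosizes[2*i] holds the length of the message destined for rank toranks[i]
     (two entries, <is id, row>, per requested remote row) and tosizes[2*i+1] holds the offset of
     that message within the packed send buffer todata[] that is filled below. */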
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since Open MPI has some bugs in its one-sided support */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <is id, row> */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner = rrow_ranks[i]; /* processor */
    j = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows that belong to remote processors so that we can then get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since Open MPI has some bugs in its one-sided support */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
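
/*
  The scalable path above proceeds in three steps: (1) each rank ships the index-set entries it does
  not own to the owning ranks (first PetscCommBuildTwoSided/PetscSF round), (2) the owners expand those
  rows into column lists with MatIncreaseOverlap_MPIAIJ_Send_Scalable(), and (3) the expanded lists are
  returned and merged into the local index sets by MatIncreaseOverlap_MPIAIJ_Receive_Scalable() below.
*/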

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt *isz, isz_i, i, j, is_id, data_size;
  PetscInt col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i] = lsize;
  }
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get the row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(PETSC_SUCCESS);
}
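
/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable packs, for each requesting rank, a sequence of triples
  <is id, count, sorted column indices> describing the overlap of the rows that rank asked about;
  sbrowsizes[2*i] is the total length of the message for fromranks[i] and sbrowsizes[2*i+1] its offset.
*/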

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout rmap, cmap;
  PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat amat, bmat;
  PetscMPIInt rank;
  PetscBool done;
  MPI_Comm comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the matrix is symmetric, we still treat it as if it were not */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* is there a better way to estimate the number of nonzeros in the matrix? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos = 0;
  totalrows = 0;
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id = fromrows[rows_pos++]; /* is id */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate the space needed so as to avoid multiple allocations */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row = rows_i[l] - rstart;
        start = ai[row];
        end = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <is id, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos = 0;
  /* reuse the grouping code from above */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row = rows_i[l] - rstart;
        start = ai[row];
        end = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1] = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}
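
/*
  The local stage: for every index set, every locally owned row in the set is expanded by the column
  indices of its nonzeros in both the diagonal (A) and off-diagonal (B) blocks, and the result is
  sorted with duplicates removed before the IS is rebuilt.
*/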

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt lsize, lsize_tmp;
  PetscMPIInt rank, owner;
  Mat amat, bmat;
  PetscBool done;
  PetscLayout cmap, rmap;
  MPI_Comm comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "cannot get row IJ");
  /* is this a safe way to compute the number of nonzero values? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* this is a better way to estimate memory than the old implementation, which used the global size of the matrix */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue;
      /* local row number */
      row -= rstart;
      start = ai[row];
      end = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(PETSC_SUCCESS);
}
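
/*
  Illustrative sketch (not part of this file's API): the routines above and below are normally reached
  through the public interface, roughly as follows for an MPIAIJ matrix A and an index set is[0] built
  by the caller (error checking omitted):

    IS  is[1];
    Mat *submats;
    PetscCall(MatIncreaseOverlap(A, 1, is, 2));                                  // grow is[0] by 2 layers
    PetscCall(MatCreateSubMatrices(A, 1, is, is, MAT_INITIAL_MATRIX, &submats)); // extract the overlapped block
*/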

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]   data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI *table_data, table_data_i;
  PetscInt *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt size, rank, tag1, tag2, proc = 0;
  PetscInt M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT *table;
  MPI_Comm comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status *recv_status;
  MPI_Comm *iscomms;
  char *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector */
    idx_i = idx[i];
    len = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j];
        w3[j]++;
      }
    }
  }

  nrqs = 0;     /* no of outgoing messages */
  msz = 0;      /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa is a list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1;
  }
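
  /* For each of the imax index sets the local work below uses a PetscBT bit array over all M global
     rows (carved out of t_p); the accumulated rows go either into a dense array of M entries per set
     (d_p, non-CTABLE build) or into a PetscHMapI per set (CTABLE build). */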

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i] = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i = n[i];
      table_i = table[i];
      idx_i = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j = outdat[j];
          k = ++outdat_j[0];
          outdat_j[2 * k] = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, and from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT table_i;

    for (i = 0; i < nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1 = 2 * rbuf2_i[0] + 1;
      jmax = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max = rbuf2_i[2 * j];
        is_no = rbuf2_i[2 * j - 1];
        isz_i = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscHashIter tpos;

    table_data_i = table_data[i];
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, k);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --k;
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
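
/*
  Note on the PETSC_USE_CTABLE build: keys and values are stored in the PetscHMapI shifted by +1
  (row + 1 -> position + 1) so that 0 can act as the "not present" value; the iteration loops above
  and in MatIncreaseOverlap_MPIAIJ_Local() undo the shift with --k / --row and --j before use.
*/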

/*
  MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap to do
  the work on the local processor.

  Inputs:
    C    - MAT_MPIAIJ;
    imax - total no of index sets processed at a time;
    table - an array of char - size = m bits.

  Output:
    isz  - array containing the count of the solution elements corresponding
           to each index set;
    data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT table_i;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI table_data_i;
  PetscHashIter tpos;
  PetscInt tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai = a->i;
  aj = a->j;
  bi = b->i;
  bj = b->j;
  garray = c->garray;

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, row);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i = isz[i];
    max = isz[i];

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
  and return the output

  Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf - an array of pointers to the received requests

  Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a realloc is required we need only copy the data for that one row
  rather than all previous rows as it is now where a single large chunk of
  memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt *rbuf_i, kmax, rbuf_0;
  PetscBT xtable;

  PetscFunctionBegin;
  m = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai = a->i;
  aj = a->j;
  bi = b->i;
  bj = b->j;
  garray = c->garray;

  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0;
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1 = 2 * rbuf_0 + 1;
    ct2 = ct1;
    ct3 += ct1;
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i */
      PetscCall(PetscBTMemzero(m, xtable));
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0] = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row = xdata[i][k] - rstart;
        start = ai[row];
        end = ai[row + 1];
        for (l = start; l < end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0] = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end = bi[row + 1];
        for (l = start; l < end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0] = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2;
    isz1[i] = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(PETSC_SUCCESS);
}

extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
/*
  Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL;
  PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
  PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* Tell every processor the number of nonzeros per row */
    PetscCall(PetscMalloc1(A->rmap->N, &lens));
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
    PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) {
      recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
      displs[i] = A->rmap->range[i];
    }
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* Create the sequential matrix of the same type as the local block diagonal */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /* Copy my part of matrix column indices over */
    sendcount = ad->nz + bd->nz;
    jsendbuf = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n = A->rmap->rend - A->rmap->rstart;
    cnt = 0;
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* Gather all column indices to all processors */
    for (i = 0; i < size; i++) {
      recvcounts[i] = 0;
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* Assemble the matrix into usable form (note numerical values not yet set) */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /* Copy my part of matrix numerical values into the values location */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar *sendbuf, *recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj = bd->j;
    n = A->rmap->rend - A->rmap->rstart;
    cnt = 0;
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* Gather all numerical values to all processors */
    if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* end of (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts, displs));

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
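
/*
  MatCreateSubMatrices_MPIAIJ_SingleIS_Local - extracts the single sequential submatrix defined by
  isrow[0] and iscol[0] (every process must pass ismax = 1); rows owned by other processes are
  requested, their column indices and values received, and the result is assembled into a
  sequential AIJ matrix (submat).
*/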
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat submat, A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
  PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
  PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
  const PetscInt *icol, *irow;
  PetscInt nrow, ncol, start;
  PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
  PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
  PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
  PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
  PetscInt *lens, rmax, ncols, *cols, Crow;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI cmap, rmap;
  PetscInt *cmap_loc, *rmap_loc;
#else
  PetscInt *cmap, *rmap;
#endif
  PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
  PetscInt *cworkB, lwrite, *subcols, *row2proc;
  PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
  MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
  MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
  MPI_Status *r_status3 = NULL, *r_status4, *s_status4;
  MPI_Comm comm;
  PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
  PetscMPIInt *onodes1, *olengths1, idex, end;
  Mat_SubSppt *smatis1;
  PetscBool isrowsorted, iscolsorted;

  PetscFunctionBegin;
  PetscValidLogicalCollectiveInt(C, ismax, 2);
  PetscValidLogicalCollectiveEnum(C, scall, 5);
  PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
  PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;

  PetscCall(ISSorted(iscol[0], &iscolsorted));
  PetscCall(ISSorted(isrow[0], &isrowsorted));
  PetscCall(ISGetIndices(isrow[0], &irow));
  PetscCall(ISGetLocalSize(isrow[0], &nrow));
  if (allcolumns) {
    icol = NULL;
    ncol = C->cmap->N;
  } else {
    PetscCall(ISGetIndices(iscol[0], &icol));
    PetscCall(ISGetLocalSize(iscol[0], &ncol));
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
    PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));
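
    /* In outline, the exchange below proceeds in four rounds: tag1 carries the requested global row
       numbers, tag2 returns the number of columns of each requested row, tag3 the column indices,
       and tag4 (obtained further down) the numerical values. */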

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    PetscCall(PetscCalloc2(size, &w1, size, &w2));
    PetscCall(PetscMalloc1(nrow, &row2proc));

    /* w1[proc] = num of rows owned by proc -- to be requested */
    proc = 0;
    nrqs = 0; /* num of outgoing messages */
    for (j = 0; j < nrow; j++) {
      row = irow[j];
      if (!isrowsorted) proc = 0;
      while (row >= C->rmap->range[proc + 1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */

      if (proc != rank && !w2[proc]) {
        w2[proc] = 1;
        nrqs++;
      }
    }
    w1[rank] = 0; /* rows owned by self will not be requested */

    PetscCall(PetscMalloc1(nrqs, &pa)); /* (proc-array) */
    for (proc = 0, j = 0; proc < size; proc++) {
      if (w1[proc]) pa[j++] = proc;
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
    msz = 0; /* total mesg length (for all procs) */
    for (i = 0; i < nrqs; i++) {
      proc = pa[i];
      w1[proc] += 3;
      msz += w1[proc];
    }
    PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));

    /* Determine nrqr, the number of messages to expect, their lengths, and from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
    PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));

    /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

    /* Now post the Irecvs corresponding to these messages */
    PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Allocate Memory for outgoing messages */
    PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
    PetscCall(PetscArrayzero(sbuf1, size));
    PetscCall(PetscArrayzero(ptr, size));

    /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i = 0; i < nrqs; i++) {
      proc = pa[i];
      sbuf1[proc] = iptr;
      iptr += w1[proc];
    }

    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i = 0; i < nrqs; i++) {
      proc = pa[i];
      PetscCall(PetscArrayzero(sbuf1[proc], 3));
      ptr[proc] = sbuf1[proc] + 3;
    }

    /* Parse the isrow and copy data into outbuf */
    PetscCall(PetscArrayzero(ctr, size));
    for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf */
        *ptr[proc] = irow[j];
        ctr[proc]++;
        ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j = 0; j < size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j = sbuf1[j];
        k = ++sbuf1_j[0];
        sbuf1_j[2 * k] = ctr_j;
        sbuf1_j[2 * k - 1] = 0;
      }
    }

    /* Now post the sends */
    PetscCall(PetscMalloc1(nrqs, &s_waits1));
    for (i = 0; i < nrqs; ++i) {
      proc = pa[i];
      PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
    }
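
    /* Each reply received into rbuf2[i] starts with the total number of column indices that the
       owning rank will send (used below to size rbuf3[i] and rbuf4[i]), followed by the per-row
       column counts. */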

    /* Post Receives to capture the buffer size */
    PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
    PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));

    if (nrqs) rbuf2[0] = tmp + msz;
    for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];

    for (i = 0; i < nrqs; ++i) {
      proc = pa[i];
      PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
    }

    PetscCall(PetscFree2(w1, w2));

    /* Send to other procs the buf size they should allocate */
    /* Receive messages */
    PetscCall(PetscMalloc1(nrqr, &r_status1));
    PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));

    PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
    for (i = 0; i < nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i = rbuf1[i];
      start = 2 * rbuf1_i[0] + 1;
      PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
      PetscCall(PetscMalloc1(end, &sbuf2[i]));
      sbuf2_i = sbuf2[i];
      for (j = start; j < end; j++) {
        k = rbuf1_i[j] - rstart;
        ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
        sbuf2_i[j] = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

      PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
    }

    PetscCall(PetscFree(r_status1));
    PetscCall(PetscFree(r_waits1));

    /* rbuf2 is received, Post recv column indices a->j */
    PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));

    PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
    for (i = 0; i < nrqs; ++i) {
      PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
      req_source2[i] = r_status2[i].MPI_SOURCE;
      PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
    }

    /* Wait on sends1 and sends2 */
    PetscCall(PetscMalloc1(nrqs, &s_status1));
    PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
    PetscCall(PetscFree(s_waits1));
    PetscCall(PetscFree(s_status1));

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
    PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));

    /* Now allocate sending buffers for a->j, and send them off */
    PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
    for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
    if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0]));
    for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

    for (i = 0; i < nrqr; i++) { /* for each requested message */
      rbuf1_i = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1 = 2 * rbuf1_i[0] + 1;
      ct2 = 0;
      /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k = 0; k < kmax; k++, ct1++) { /* for each row */
        row = rbuf1_i[ct1] - rstart;
        nzA = ai[row + 1] - ai[row];
        nzB = bi[row + 1] - bi[row];
        ncols = nzA + nzB;
        cworkA = aj + ai[row];
        cworkB = bj + bi[row];

        /* load the column indices for this row into cols */
        cols = sbuf_aj_i + ct2;

        lwrite = 0;
        for (l = 0; l < nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l = 0; l < nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
    }

    /* create column map (cmap): global col of C -> local col of submat */
#if defined(PETSC_USE_CTABLE)
    if (!allcolumns) {
      PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
      PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
      for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
        if (icol[j] >= cstart && icol[j] < cend) {
          cmap_loc[icol[j] - cstart] = j + 1;
        } else { /* use PetscHMapI for non-local col indices */
          PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
        }
      }
    } else {
      cmap = NULL;
      cmap_loc = NULL;
    }
    PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
#else
    if (!allcolumns) {
      PetscCall(PetscCalloc1(C->cmap->N, &cmap));
      for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
    } else {
      cmap = NULL;
    }
#endif

    /* Create lens for MatSeqAIJSetPreallocation() */
    PetscCall(PetscCalloc1(nrow, &lens));

    /* Compute lens from local part of C */
    for (j = 0; j < nrow; j++) {
      row = irow[j];
      proc = row2proc[j];
      if (proc == rank) {
        /* diagonal part A = c->A */
        ncols = ai[row - rstart + 1] - ai[row - rstart];
        cols = aj + ai[row - rstart];
        if (!allcolumns) {
          for (k = 0; k < ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            tcol = cmap_loc[cols[k]];
#else
            tcol = cmap[cols[k] + cstart];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] = ncols;
        }

        /* off-diagonal part B = c->B */
        ncols = bi[row - rstart + 1] - bi[row - rstart];
        cols = bj + bi[row - rstart];
        if (!allcolumns) {
          for (k = 0; k < ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
#else
            tcol = cmap[bmap[cols[k]]];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] += ncols;
        }
      }
    }

    /* Create row map (rmap): global row of C -> local row of submat */
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
    for (j = 0; j < nrow; j++) {
      row = irow[j];
      proc = row2proc[j];
      if (proc == rank) { /* a local row */
        rmap_loc[row - rstart] = j;
      } else {
        PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
      }
    }
#else
    PetscCall(PetscCalloc1(C->rmap->N, &rmap));
    for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
#endif

    /* Update lens from offproc data */
    /* recv a->j is done */
    PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
    for (i = 0; i < nrqs; i++) {
      proc = pa[i];
      sbuf1_i = sbuf1[proc];
      /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
      ct1 = 2 + 1;
      ct2 = 0;
      rbuf2_i = rbuf2[i]; /* received length of C->j */
      rbuf3_i = rbuf3[i]; /* received C->j */

      /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
      max1 = sbuf1_i[2];
      for (k = 0; k < max1; k++, ct1++) {
submat */ 1589 #endif 1590 /* Now, store row index of submat in sbuf1_i[ct1] */ 1591 sbuf1_i[ct1] = row; 1592 1593 nnz = rbuf2_i[ct1]; 1594 if (!allcolumns) { 1595 for (l = 0; l < nnz; l++, ct2++) { 1596 #if defined(PETSC_USE_CTABLE) 1597 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1598 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1599 } else { 1600 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1601 } 1602 #else 1603 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1604 #endif 1605 if (tcol) lens[row]++; 1606 } 1607 } else { /* allcolumns */ 1608 lens[row] += nnz; 1609 } 1610 } 1611 } 1612 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1613 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1614 1615 /* Create the submatrices */ 1616 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1617 PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1618 1619 PetscCall(ISGetBlockSize(isrow[0], &i)); 1620 PetscCall(ISGetBlockSize(iscol[0], &j)); 1621 PetscCall(MatSetBlockSizes(submat, i, j)); 1622 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1623 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1624 1625 /* create struct Mat_SubSppt and attached it to submat */ 1626 PetscCall(PetscNew(&smatis1)); 1627 subc = (Mat_SeqAIJ *)submat->data; 1628 subc->submatis1 = smatis1; 1629 1630 smatis1->id = 0; 1631 smatis1->nrqs = nrqs; 1632 smatis1->nrqr = nrqr; 1633 smatis1->rbuf1 = rbuf1; 1634 smatis1->rbuf2 = rbuf2; 1635 smatis1->rbuf3 = rbuf3; 1636 smatis1->sbuf2 = sbuf2; 1637 smatis1->req_source2 = req_source2; 1638 1639 smatis1->sbuf1 = sbuf1; 1640 smatis1->ptr = ptr; 1641 smatis1->tmp = tmp; 1642 smatis1->ctr = ctr; 1643 1644 smatis1->pa = pa; 1645 smatis1->req_size = req_size; 1646 smatis1->req_source1 = req_source1; 1647 1648 smatis1->allcolumns = allcolumns; 1649 smatis1->singleis = PETSC_TRUE; 1650 smatis1->row2proc = row2proc; 1651 smatis1->rmap = rmap; 1652 smatis1->cmap = cmap; 1653 #if defined(PETSC_USE_CTABLE) 1654 smatis1->rmap_loc = rmap_loc; 1655 smatis1->cmap_loc = cmap_loc; 1656 #endif 1657 1658 smatis1->destroy = submat->ops->destroy; 1659 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1660 submat->factortype = C->factortype; 1661 1662 /* compute rmax */ 1663 rmax = 0; 1664 for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1665 1666 } else { /* scall == MAT_REUSE_MATRIX */ 1667 submat = submats[0]; 1668 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1669 1670 subc = (Mat_SeqAIJ *)submat->data; 1671 rmax = subc->rmax; 1672 smatis1 = subc->submatis1; 1673 nrqs = smatis1->nrqs; 1674 nrqr = smatis1->nrqr; 1675 rbuf1 = smatis1->rbuf1; 1676 rbuf2 = smatis1->rbuf2; 1677 rbuf3 = smatis1->rbuf3; 1678 req_source2 = smatis1->req_source2; 1679 1680 sbuf1 = smatis1->sbuf1; 1681 sbuf2 = smatis1->sbuf2; 1682 ptr = smatis1->ptr; 1683 tmp = smatis1->tmp; 1684 ctr = smatis1->ctr; 1685 1686 pa = smatis1->pa; 1687 req_size = smatis1->req_size; 1688 req_source1 = smatis1->req_source1; 1689 1690 allcolumns = smatis1->allcolumns; 1691 row2proc = smatis1->row2proc; 1692 rmap = smatis1->rmap; 1693 cmap = smatis1->cmap; 1694 #if defined(PETSC_USE_CTABLE) 1695 rmap_loc = smatis1->rmap_loc; 1696 cmap_loc = smatis1->cmap_loc; 1697 #endif 1698 } 1699 1700 /* Post recv matrix values */ 1701 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1702 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1703 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1704 for (i = 0; i < nrqs; ++i) { 1705 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1706 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1707 } 1708 1709 /* Allocate sending buffers for a->a, and send them off */ 1710 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1711 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1712 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 1713 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1714 1715 for (i = 0; i < nrqr; i++) { 1716 rbuf1_i = rbuf1[i]; 1717 sbuf_aa_i = sbuf_aa[i]; 1718 ct1 = 2 * rbuf1_i[0] + 1; 1719 ct2 = 0; 1720 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1721 1722 kmax = rbuf1_i[2]; 1723 for (k = 0; k < kmax; k++, ct1++) { 1724 row = rbuf1_i[ct1] - rstart; 1725 nzA = ai[row + 1] - ai[row]; 1726 nzB = bi[row + 1] - bi[row]; 1727 ncols = nzA + nzB; 1728 cworkB = bj + bi[row]; 1729 vworkA = a_a + ai[row]; 1730 vworkB = b_a + bi[row]; 1731 1732 /* load the column values for this row into vals*/ 1733 vals = sbuf_aa_i + ct2; 1734 1735 lwrite = 0; 1736 for (l = 0; l < nzB; l++) { 1737 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1738 } 1739 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1740 for (l = 0; l < nzB; l++) { 1741 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1742 } 1743 1744 ct2 += ncols; 1745 } 1746 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1747 } 1748 1749 /* Assemble submat */ 1750 /* First assemble the local rows */ 1751 for (j = 0; j < nrow; j++) { 1752 row = irow[j]; 1753 proc = row2proc[j]; 1754 if (proc == rank) { 1755 Crow = row - rstart; /* local row index of C */ 1756 #if defined(PETSC_USE_CTABLE) 1757 row = rmap_loc[Crow]; /* row index of submat */ 1758 #else 1759 row = rmap[row]; 1760 #endif 1761 1762 if (allcolumns) { 1763 /* diagonal part A = c->A */ 1764 ncols = ai[Crow + 1] - ai[Crow]; 1765 cols = aj + ai[Crow]; 1766 vals = a_a + ai[Crow]; 1767 i = 0; 1768 for (k = 0; k < ncols; k++) { 1769 subcols[i] = cols[k] + cstart; 1770 subvals[i++] = vals[k]; 1771 } 1772 1773 /* off-diagonal part B = c->B */ 1774 ncols = bi[Crow + 1] - bi[Crow]; 1775 cols = bj + bi[Crow]; 1776 vals = b_a + bi[Crow]; 1777 for (k = 0; k < ncols; k++) { 1778 subcols[i] = bmap[cols[k]]; 1779 subvals[i++] = vals[k]; 1780 } 1781 1782 PetscCall(MatSetValues_SeqAIJ(submat, 1, 
&row, i, subcols, subvals, INSERT_VALUES)); 1783 1784 } else { /* !allcolumns */ 1785 #if defined(PETSC_USE_CTABLE) 1786 /* diagonal part A = c->A */ 1787 ncols = ai[Crow + 1] - ai[Crow]; 1788 cols = aj + ai[Crow]; 1789 vals = a_a + ai[Crow]; 1790 i = 0; 1791 for (k = 0; k < ncols; k++) { 1792 tcol = cmap_loc[cols[k]]; 1793 if (tcol) { 1794 subcols[i] = --tcol; 1795 subvals[i++] = vals[k]; 1796 } 1797 } 1798 1799 /* off-diagonal part B = c->B */ 1800 ncols = bi[Crow + 1] - bi[Crow]; 1801 cols = bj + bi[Crow]; 1802 vals = b_a + bi[Crow]; 1803 for (k = 0; k < ncols; k++) { 1804 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1805 if (tcol) { 1806 subcols[i] = --tcol; 1807 subvals[i++] = vals[k]; 1808 } 1809 } 1810 #else 1811 /* diagonal part A = c->A */ 1812 ncols = ai[Crow + 1] - ai[Crow]; 1813 cols = aj + ai[Crow]; 1814 vals = a_a + ai[Crow]; 1815 i = 0; 1816 for (k = 0; k < ncols; k++) { 1817 tcol = cmap[cols[k] + cstart]; 1818 if (tcol) { 1819 subcols[i] = --tcol; 1820 subvals[i++] = vals[k]; 1821 } 1822 } 1823 1824 /* off-diagonal part B = c->B */ 1825 ncols = bi[Crow + 1] - bi[Crow]; 1826 cols = bj + bi[Crow]; 1827 vals = b_a + bi[Crow]; 1828 for (k = 0; k < ncols; k++) { 1829 tcol = cmap[bmap[cols[k]]]; 1830 if (tcol) { 1831 subcols[i] = --tcol; 1832 subvals[i++] = vals[k]; 1833 } 1834 } 1835 #endif 1836 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1837 } 1838 } 1839 } 1840 1841 /* Now assemble the off-proc rows */ 1842 for (i = 0; i < nrqs; i++) { /* for each requested message */ 1843 /* recv values from other processes */ 1844 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1845 proc = pa[idex]; 1846 sbuf1_i = sbuf1[proc]; 1847 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1848 ct1 = 2 + 1; 1849 ct2 = 0; /* count of received C->j */ 1850 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1851 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1852 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1853 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1854 1855 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1856 max1 = sbuf1_i[2]; /* num of rows */ 1857 for (k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1858 row = sbuf1_i[ct1]; /* row index of submat */ 1859 if (!allcolumns) { 1860 idex = 0; 1861 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1862 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1863 for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1864 #if defined(PETSC_USE_CTABLE) 1865 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1866 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1867 } else { 1868 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1869 } 1870 #else 1871 tcol = cmap[rbuf3_i[ct2]]; 1872 #endif 1873 if (tcol) { 1874 subcols[idex] = --tcol; /* may not be sorted */ 1875 subvals[idex++] = rbuf4_i[ct2]; 1876 1877 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
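                 (Here "an entire column" means all of the column indices of one received row of C; only the
                  entries whose columns also appear in iscol, located through cmap/cmap_loc, are kept.)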
1878 For reuse, we replace received C->j with index that should be inserted to submat */ 1879 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1880 } 1881 } 1882 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1883 } else { /* scall == MAT_REUSE_MATRIX */ 1884 submat = submats[0]; 1885 subc = (Mat_SeqAIJ *)submat->data; 1886 1887 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1888 for (l = 0; l < nnz; l++) { 1889 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1890 subvals[idex++] = rbuf4_i[ct2]; 1891 } 1892 1893 bj = subc->j + subc->i[row]; /* sorted column indices */ 1894 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1895 } 1896 } else { /* allcolumns */ 1897 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1898 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES)); 1899 ct2 += nnz; 1900 } 1901 } 1902 } 1903 1904 /* sending a->a are done */ 1905 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1906 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1907 1908 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1909 PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY)); 1910 submats[0] = submat; 1911 1912 /* Restore the indices */ 1913 PetscCall(ISRestoreIndices(isrow[0], &irow)); 1914 if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol)); 1915 1916 /* Destroy allocated memory */ 1917 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 1918 PetscCall(PetscFree3(rbuf4, subcols, subvals)); 1919 if (sbuf_aa) { 1920 PetscCall(PetscFree(sbuf_aa[0])); 1921 PetscCall(PetscFree(sbuf_aa)); 1922 } 1923 1924 if (scall == MAT_INITIAL_MATRIX) { 1925 PetscCall(PetscFree(lens)); 1926 if (sbuf_aj) { 1927 PetscCall(PetscFree(sbuf_aj[0])); 1928 PetscCall(PetscFree(sbuf_aj)); 1929 } 1930 } 1931 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 1932 PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a)); 1933 PetscFunctionReturn(PETSC_SUCCESS); 1934 } 1935 1936 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1937 { 1938 PetscInt ncol; 1939 PetscBool colflag, allcolumns = PETSC_FALSE; 1940 1941 PetscFunctionBegin; 1942 /* Allocate memory to hold all the submatrices */ 1943 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat)); 1944 1945 /* Check for special case: each processor gets entire matrix columns */ 1946 PetscCall(ISIdentity(iscol[0], &colflag)); 1947 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1948 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1949 1950 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat)); 1951 PetscFunctionReturn(PETSC_SUCCESS); 1952 } 1953 1954 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1955 { 1956 PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2]; 1957 PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE; 1958 Mat_SeqAIJ *subc; 1959 Mat_SubSppt *smat; 1960 1961 PetscFunctionBegin; 1962 /* Check for special case: each processor has a single IS */ 1963 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1964 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 1965 
    C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single ISs */
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Determine nmax, the number of submatrices extracted per stage, limited so that one stage's
     column workspace stays around 20 MB */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which the submatrices are extracted.
       Each stage extracts at most nmax submatrices; nmax is determined by the matrix column dimension,
       so if the original matrix has 20M columns, only one submatrix per stage is allowed, etc. */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...)
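         i.e., (*submat)[0] carries no Mat_SubSppt, which indicates it was produced by the wantallmatrix
         path (MatCreateSubMatrix_MPIAIJ_All()), so reuse must take that path again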
*/ 2010 wantallmatrix = PETSC_TRUE; 2011 } else if (smat->singleis) { 2012 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 2013 PetscFunctionReturn(PETSC_SUCCESS); 2014 } else { 2015 nstages = smat->nstages; 2016 } 2017 } 2018 2019 if (wantallmatrix) { 2020 PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat)); 2021 PetscFunctionReturn(PETSC_SUCCESS); 2022 } 2023 2024 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2025 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat)); 2026 2027 for (i = 0, pos = 0; i < nstages; i++) { 2028 if (pos + nmax <= ismax) max_no = nmax; 2029 else if (pos >= ismax) max_no = 0; 2030 else max_no = ismax - pos; 2031 2032 PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos)); 2033 if (!max_no) { 2034 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2035 smat = (Mat_SubSppt *)(*submat)[pos]->data; 2036 smat->nstages = nstages; 2037 } 2038 pos++; /* advance to next dummy matrix if any */ 2039 } else pos += max_no; 2040 } 2041 2042 if (ismax && scall == MAT_INITIAL_MATRIX) { 2043 /* save nstages for reuse */ 2044 subc = (Mat_SeqAIJ *)(*submat)[0]->data; 2045 smat = subc->submatis1; 2046 smat->nstages = nstages; 2047 } 2048 PetscFunctionReturn(PETSC_SUCCESS); 2049 } 2050 2051 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) 2052 { 2053 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 2054 Mat A = c->A; 2055 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc; 2056 const PetscInt **icol, **irow; 2057 PetscInt *nrow, *ncol, start; 2058 PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr; 2059 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1; 2060 PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol; 2061 PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2062 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2063 #if defined(PETSC_USE_CTABLE) 2064 PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i; 2065 #else 2066 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2067 #endif 2068 const PetscInt *irow_i; 2069 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2070 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2071 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2072 MPI_Comm comm; 2073 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2074 PetscMPIInt *onodes1, *olengths1, end; 2075 PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2076 Mat_SubSppt *smat_i; 2077 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2078 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen; 2079 2080 PetscFunctionBegin; 2081 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2082 size = c->size; 2083 rank = c->rank; 2084 2085 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2086 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt ***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2087 2088 for (i = 0; i < ismax; i++) { 2089 PetscCall(ISSorted(iscol[i], &issorted[i])); 2090 if (!issorted[i]) iscsorted = issorted[i]; 2091 2092 PetscCall(ISSorted(isrow[i], &issorted[i])); 2093 2094 
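    /* issorted[i] now holds the sortedness of the row IS (used below when mapping rows to their owning
       processes), while iscsorted records whether every column IS is sorted; if any is not, the assembled
       submatrix rows are sorted explicitly at the end of this routine (see the !iscsorted block). */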
PetscCall(ISGetIndices(isrow[i], &irow[i])); 2095 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2096 2097 /* Check for special case: allcolumn */ 2098 PetscCall(ISIdentity(iscol[i], &colflag)); 2099 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2100 if (colflag && ncol[i] == C->cmap->N) { 2101 allcolumns[i] = PETSC_TRUE; 2102 icol[i] = NULL; 2103 } else { 2104 allcolumns[i] = PETSC_FALSE; 2105 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2106 } 2107 } 2108 2109 if (scall == MAT_REUSE_MATRIX) { 2110 /* Assumes new rows are same length as the old rows */ 2111 for (i = 0; i < ismax; i++) { 2112 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2113 subc = (Mat_SeqAIJ *)submats[i]->data; 2114 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size"); 2115 2116 /* Initial matrix as if empty */ 2117 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2118 2119 smat_i = subc->submatis1; 2120 2121 nrqs = smat_i->nrqs; 2122 nrqr = smat_i->nrqr; 2123 rbuf1 = smat_i->rbuf1; 2124 rbuf2 = smat_i->rbuf2; 2125 rbuf3 = smat_i->rbuf3; 2126 req_source2 = smat_i->req_source2; 2127 2128 sbuf1 = smat_i->sbuf1; 2129 sbuf2 = smat_i->sbuf2; 2130 ptr = smat_i->ptr; 2131 tmp = smat_i->tmp; 2132 ctr = smat_i->ctr; 2133 2134 pa = smat_i->pa; 2135 req_size = smat_i->req_size; 2136 req_source1 = smat_i->req_source1; 2137 2138 allcolumns[i] = smat_i->allcolumns; 2139 row2proc[i] = smat_i->row2proc; 2140 rmap[i] = smat_i->rmap; 2141 cmap[i] = smat_i->cmap; 2142 } 2143 2144 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2145 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2146 smat_i = (Mat_SubSppt *)submats[0]->data; 2147 2148 nrqs = smat_i->nrqs; 2149 nrqr = smat_i->nrqr; 2150 rbuf1 = smat_i->rbuf1; 2151 rbuf2 = smat_i->rbuf2; 2152 rbuf3 = smat_i->rbuf3; 2153 req_source2 = smat_i->req_source2; 2154 2155 sbuf1 = smat_i->sbuf1; 2156 sbuf2 = smat_i->sbuf2; 2157 ptr = smat_i->ptr; 2158 tmp = smat_i->tmp; 2159 ctr = smat_i->ctr; 2160 2161 pa = smat_i->pa; 2162 req_size = smat_i->req_size; 2163 req_source1 = smat_i->req_source1; 2164 2165 allcolumns[0] = PETSC_FALSE; 2166 } 2167 } else { /* scall == MAT_INITIAL_MATRIX */ 2168 /* Get some new tags to keep the communication clean */ 2169 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2170 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2171 2172 /* evaluate communication - mesg to who, length of mesg, and buffer space 2173 required. 
Based on this, buffers are allocated, and data copied into them*/ 2174 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2175 2176 for (i = 0; i < ismax; i++) { 2177 jmax = nrow[i]; 2178 irow_i = irow[i]; 2179 2180 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2181 row2proc[i] = row2proc_i; 2182 2183 if (issorted[i]) proc = 0; 2184 for (j = 0; j < jmax; j++) { 2185 if (!issorted[i]) proc = 0; 2186 row = irow_i[j]; 2187 while (row >= C->rmap->range[proc + 1]) proc++; 2188 w4[proc]++; 2189 row2proc_i[j] = proc; /* map row index to proc */ 2190 } 2191 for (j = 0; j < size; j++) { 2192 if (w4[j]) { 2193 w1[j] += w4[j]; 2194 w3[j]++; 2195 w4[j] = 0; 2196 } 2197 } 2198 } 2199 2200 nrqs = 0; /* no of outgoing messages */ 2201 msz = 0; /* total mesg length (for all procs) */ 2202 w1[rank] = 0; /* no mesg sent to self */ 2203 w3[rank] = 0; 2204 for (i = 0; i < size; i++) { 2205 if (w1[i]) { 2206 w2[i] = 1; 2207 nrqs++; 2208 } /* there exists a message to proc i */ 2209 } 2210 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2211 for (i = 0, j = 0; i < size; i++) { 2212 if (w1[i]) { 2213 pa[j] = i; 2214 j++; 2215 } 2216 } 2217 2218 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2219 for (i = 0; i < nrqs; i++) { 2220 j = pa[i]; 2221 w1[j] += w2[j] + 2 * w3[j]; 2222 msz += w1[j]; 2223 } 2224 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2225 2226 /* Determine the number of messages to expect, their lengths, from from-ids */ 2227 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 2228 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2229 2230 /* Now post the Irecvs corresponding to these messages */ 2231 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2232 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2233 2234 /* Allocate Memory for outgoing messages */ 2235 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2236 PetscCall(PetscArrayzero(sbuf1, size)); 2237 PetscCall(PetscArrayzero(ptr, size)); 2238 2239 { 2240 PetscInt *iptr = tmp; 2241 k = 0; 2242 for (i = 0; i < nrqs; i++) { 2243 j = pa[i]; 2244 iptr += k; 2245 sbuf1[j] = iptr; 2246 k = w1[j]; 2247 } 2248 } 2249 2250 /* Form the outgoing messages. 
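       Each buffer sbuf1[j] is laid out as [nis | is_0, cnt_0 | is_1, cnt_1 | ...] followed by the requested
       row indices grouped by IS, where nis is the number of ISs that request rows from process j and cnt_k
       is the number of rows requested on behalf of IS is_k.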
Initialize the header space */ 2251 for (i = 0; i < nrqs; i++) { 2252 j = pa[i]; 2253 sbuf1[j][0] = 0; 2254 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2255 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2256 } 2257 2258 /* Parse the isrow and copy data into outbuf */ 2259 for (i = 0; i < ismax; i++) { 2260 row2proc_i = row2proc[i]; 2261 PetscCall(PetscArrayzero(ctr, size)); 2262 irow_i = irow[i]; 2263 jmax = nrow[i]; 2264 for (j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2265 proc = row2proc_i[j]; 2266 if (proc != rank) { /* copy to the outgoing buf*/ 2267 ctr[proc]++; 2268 *ptr[proc] = irow_i[j]; 2269 ptr[proc]++; 2270 } 2271 } 2272 /* Update the headers for the current IS */ 2273 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 2274 if ((ctr_j = ctr[j])) { 2275 sbuf1_j = sbuf1[j]; 2276 k = ++sbuf1_j[0]; 2277 sbuf1_j[2 * k] = ctr_j; 2278 sbuf1_j[2 * k - 1] = i; 2279 } 2280 } 2281 } 2282 2283 /* Now post the sends */ 2284 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2285 for (i = 0; i < nrqs; ++i) { 2286 j = pa[i]; 2287 PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2288 } 2289 2290 /* Post Receives to capture the buffer size */ 2291 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2292 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2293 if (nrqs) rbuf2[0] = tmp + msz; 2294 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2295 for (i = 0; i < nrqs; ++i) { 2296 j = pa[i]; 2297 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2298 } 2299 2300 /* Send to other procs the buf size they should allocate */ 2301 /* Receive messages*/ 2302 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2303 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2304 { 2305 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2306 PetscInt *sbuf2_i; 2307 2308 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2309 for (i = 0; i < nrqr; ++i) { 2310 req_size[i] = 0; 2311 rbuf1_i = rbuf1[i]; 2312 start = 2 * rbuf1_i[0] + 1; 2313 end = olengths1[i]; 2314 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2315 sbuf2_i = sbuf2[i]; 2316 for (j = start; j < end; j++) { 2317 id = rbuf1_i[j] - rstart; 2318 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2319 sbuf2_i[j] = ncols; 2320 req_size[i] += ncols; 2321 } 2322 req_source1[i] = onodes1[i]; 2323 /* form the header */ 2324 sbuf2_i[0] = req_size[i]; 2325 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2326 2327 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2328 } 2329 } 2330 2331 PetscCall(PetscFree(onodes1)); 2332 PetscCall(PetscFree(olengths1)); 2333 PetscCall(PetscFree(r_waits1)); 2334 PetscCall(PetscFree4(w1, w2, w3, w4)); 2335 2336 /* Receive messages*/ 2337 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2338 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2339 for (i = 0; i < nrqs; ++i) { 2340 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2341 req_source2[i] = pa[i]; 2342 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2343 } 2344 PetscCall(PetscFree(r_waits2)); 2345 2346 /* Wait on sends1 and sends2 */ 2347 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2348 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2349 PetscCall(PetscFree(s_waits1)); 2350 PetscCall(PetscFree(s_waits2)); 2351 2352 /* Now allocate sending buffers for a->j, and send them off */ 2353 
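    /* Within each requested row, column indices are packed in ascending global order: off-diagonal (B) columns
       that map below cstart through garray, then diagonal (A) columns shifted by cstart, then off-diagonal
       columns at or beyond cend. */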
PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 2354 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2355 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 2356 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2357 2358 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2359 { 2360 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2361 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2362 PetscInt cend = C->cmap->rend; 2363 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2364 2365 for (i = 0; i < nrqr; i++) { 2366 rbuf1_i = rbuf1[i]; 2367 sbuf_aj_i = sbuf_aj[i]; 2368 ct1 = 2 * rbuf1_i[0] + 1; 2369 ct2 = 0; 2370 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2371 kmax = rbuf1[i][2 * j]; 2372 for (k = 0; k < kmax; k++, ct1++) { 2373 row = rbuf1_i[ct1] - rstart; 2374 nzA = a_i[row + 1] - a_i[row]; 2375 nzB = b_i[row + 1] - b_i[row]; 2376 ncols = nzA + nzB; 2377 cworkA = a_j + a_i[row]; 2378 cworkB = b_j + b_i[row]; 2379 2380 /* load the column indices for this row into cols */ 2381 cols = sbuf_aj_i + ct2; 2382 2383 lwrite = 0; 2384 for (l = 0; l < nzB; l++) { 2385 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2386 } 2387 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2388 for (l = 0; l < nzB; l++) { 2389 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2390 } 2391 2392 ct2 += ncols; 2393 } 2394 } 2395 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2396 } 2397 } 2398 2399 /* create col map: global col of C -> local col of submatrices */ 2400 { 2401 const PetscInt *icol_i; 2402 #if defined(PETSC_USE_CTABLE) 2403 for (i = 0; i < ismax; i++) { 2404 if (!allcolumns[i]) { 2405 PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i)); 2406 2407 jmax = ncol[i]; 2408 icol_i = icol[i]; 2409 cmap_i = cmap[i]; 2410 for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1)); 2411 } else cmap[i] = NULL; 2412 } 2413 #else 2414 for (i = 0; i < ismax; i++) { 2415 if (!allcolumns[i]) { 2416 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2417 jmax = ncol[i]; 2418 icol_i = icol[i]; 2419 cmap_i = cmap[i]; 2420 for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2421 } else cmap[i] = NULL; 2422 } 2423 #endif 2424 } 2425 2426 /* Create lens which is required for MatCreate... 
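       lens[i][j] will hold the number of nonzeros of row j of submatrix i and is later passed to
       MatSeqAIJSetPreallocation()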
*/ 2427 for (i = 0, j = 0; i < ismax; i++) j += nrow[i]; 2428 PetscCall(PetscMalloc1(ismax, &lens)); 2429 2430 if (ismax) PetscCall(PetscCalloc1(j, &lens[0])); 2431 for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1]; 2432 2433 /* Update lens from local data */ 2434 for (i = 0; i < ismax; i++) { 2435 row2proc_i = row2proc[i]; 2436 jmax = nrow[i]; 2437 if (!allcolumns[i]) cmap_i = cmap[i]; 2438 irow_i = irow[i]; 2439 lens_i = lens[i]; 2440 for (j = 0; j < jmax; j++) { 2441 row = irow_i[j]; 2442 proc = row2proc_i[j]; 2443 if (proc == rank) { 2444 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2445 if (!allcolumns[i]) { 2446 for (k = 0; k < ncols; k++) { 2447 #if defined(PETSC_USE_CTABLE) 2448 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2449 #else 2450 tcol = cmap_i[cols[k]]; 2451 #endif 2452 if (tcol) lens_i[j]++; 2453 } 2454 } else { /* allcolumns */ 2455 lens_i[j] = ncols; 2456 } 2457 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2458 } 2459 } 2460 } 2461 2462 /* Create row map: global row of C -> local row of submatrices */ 2463 #if defined(PETSC_USE_CTABLE) 2464 for (i = 0; i < ismax; i++) { 2465 PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i)); 2466 irow_i = irow[i]; 2467 jmax = nrow[i]; 2468 for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1)); 2469 } 2470 #else 2471 for (i = 0; i < ismax; i++) { 2472 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2473 rmap_i = rmap[i]; 2474 irow_i = irow[i]; 2475 jmax = nrow[i]; 2476 for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2477 } 2478 #endif 2479 2480 /* Update lens from offproc data */ 2481 { 2482 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2483 2484 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2485 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2486 sbuf1_i = sbuf1[pa[tmp2]]; 2487 jmax = sbuf1_i[0]; 2488 ct1 = 2 * jmax + 1; 2489 ct2 = 0; 2490 rbuf2_i = rbuf2[tmp2]; 2491 rbuf3_i = rbuf3[tmp2]; 2492 for (j = 1; j <= jmax; j++) { 2493 is_no = sbuf1_i[2 * j - 1]; 2494 max1 = sbuf1_i[2 * j]; 2495 lens_i = lens[is_no]; 2496 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2497 rmap_i = rmap[is_no]; 2498 for (k = 0; k < max1; k++, ct1++) { 2499 #if defined(PETSC_USE_CTABLE) 2500 PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row)); 2501 row--; 2502 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2503 #else 2504 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2505 #endif 2506 max2 = rbuf2_i[ct1]; 2507 for (l = 0; l < max2; l++, ct2++) { 2508 if (!allcolumns[is_no]) { 2509 #if defined(PETSC_USE_CTABLE) 2510 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2511 #else 2512 tcol = cmap_i[rbuf3_i[ct2]]; 2513 #endif 2514 if (tcol) lens_i[row]++; 2515 } else { /* allcolumns */ 2516 lens_i[row]++; /* lens_i[row] += max2 ? 
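                       (equivalent in total: this branch executes max2 times, once per received entry,
                        and the surrounding loop must still advance ct2)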
*/ 2517 } 2518 } 2519 } 2520 } 2521 } 2522 } 2523 PetscCall(PetscFree(r_waits3)); 2524 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2525 PetscCall(PetscFree(s_waits3)); 2526 2527 /* Create the submatrices */ 2528 for (i = 0; i < ismax; i++) { 2529 PetscInt rbs, cbs; 2530 2531 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2532 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2533 2534 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2535 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2536 2537 PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2538 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2539 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2540 2541 /* create struct Mat_SubSppt and attached it to submat */ 2542 PetscCall(PetscNew(&smat_i)); 2543 subc = (Mat_SeqAIJ *)submats[i]->data; 2544 subc->submatis1 = smat_i; 2545 2546 smat_i->destroy = submats[i]->ops->destroy; 2547 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2548 submats[i]->factortype = C->factortype; 2549 2550 smat_i->id = i; 2551 smat_i->nrqs = nrqs; 2552 smat_i->nrqr = nrqr; 2553 smat_i->rbuf1 = rbuf1; 2554 smat_i->rbuf2 = rbuf2; 2555 smat_i->rbuf3 = rbuf3; 2556 smat_i->sbuf2 = sbuf2; 2557 smat_i->req_source2 = req_source2; 2558 2559 smat_i->sbuf1 = sbuf1; 2560 smat_i->ptr = ptr; 2561 smat_i->tmp = tmp; 2562 smat_i->ctr = ctr; 2563 2564 smat_i->pa = pa; 2565 smat_i->req_size = req_size; 2566 smat_i->req_source1 = req_source1; 2567 2568 smat_i->allcolumns = allcolumns[i]; 2569 smat_i->singleis = PETSC_FALSE; 2570 smat_i->row2proc = row2proc[i]; 2571 smat_i->rmap = rmap[i]; 2572 smat_i->cmap = cmap[i]; 2573 } 2574 2575 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2576 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2577 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2578 PetscCall(MatSetType(submats[0], MATDUMMY)); 2579 2580 /* create struct Mat_SubSppt and attached it to submat */ 2581 PetscCall(PetscNew(&smat_i)); 2582 submats[0]->data = (void *)smat_i; 2583 2584 smat_i->destroy = submats[0]->ops->destroy; 2585 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2586 submats[0]->factortype = C->factortype; 2587 2588 smat_i->id = 0; 2589 smat_i->nrqs = nrqs; 2590 smat_i->nrqr = nrqr; 2591 smat_i->rbuf1 = rbuf1; 2592 smat_i->rbuf2 = rbuf2; 2593 smat_i->rbuf3 = rbuf3; 2594 smat_i->sbuf2 = sbuf2; 2595 smat_i->req_source2 = req_source2; 2596 2597 smat_i->sbuf1 = sbuf1; 2598 smat_i->ptr = ptr; 2599 smat_i->tmp = tmp; 2600 smat_i->ctr = ctr; 2601 2602 smat_i->pa = pa; 2603 smat_i->req_size = req_size; 2604 smat_i->req_source1 = req_source1; 2605 2606 smat_i->allcolumns = PETSC_FALSE; 2607 smat_i->singleis = PETSC_FALSE; 2608 smat_i->row2proc = NULL; 2609 smat_i->rmap = NULL; 2610 smat_i->cmap = NULL; 2611 } 2612 2613 if (ismax) PetscCall(PetscFree(lens[0])); 2614 PetscCall(PetscFree(lens)); 2615 if (sbuf_aj) { 2616 PetscCall(PetscFree(sbuf_aj[0])); 2617 PetscCall(PetscFree(sbuf_aj)); 2618 } 2619 2620 } /* endof scall == MAT_INITIAL_MATRIX */ 2621 2622 /* Post recv matrix values */ 2623 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2624 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2625 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2626 for (i = 0; i < nrqs; ++i) { 2627 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2628 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2629 } 2630 2631 /* Allocate sending buffers for a->a, and 
send them off */ 2632 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2633 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2634 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 2635 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2636 2637 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2638 { 2639 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite; 2640 PetscInt cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2641 PetscInt cend = C->cmap->rend; 2642 PetscInt *b_j = b->j; 2643 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2644 2645 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2646 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2647 for (i = 0; i < nrqr; i++) { 2648 rbuf1_i = rbuf1[i]; 2649 sbuf_aa_i = sbuf_aa[i]; 2650 ct1 = 2 * rbuf1_i[0] + 1; 2651 ct2 = 0; 2652 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2653 kmax = rbuf1_i[2 * j]; 2654 for (k = 0; k < kmax; k++, ct1++) { 2655 row = rbuf1_i[ct1] - rstart; 2656 nzA = a_i[row + 1] - a_i[row]; 2657 nzB = b_i[row + 1] - b_i[row]; 2658 ncols = nzA + nzB; 2659 cworkB = b_j + b_i[row]; 2660 vworkA = a_a + a_i[row]; 2661 vworkB = b_a + b_i[row]; 2662 2663 /* load the column values for this row into vals*/ 2664 vals = sbuf_aa_i + ct2; 2665 2666 lwrite = 0; 2667 for (l = 0; l < nzB; l++) { 2668 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2669 } 2670 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2671 for (l = 0; l < nzB; l++) { 2672 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2673 } 2674 2675 ct2 += ncols; 2676 } 2677 } 2678 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2679 } 2680 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2681 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2682 } 2683 2684 /* Assemble the matrices */ 2685 /* First assemble the local rows */ 2686 for (i = 0; i < ismax; i++) { 2687 row2proc_i = row2proc[i]; 2688 subc = (Mat_SeqAIJ *)submats[i]->data; 2689 imat_ilen = subc->ilen; 2690 imat_j = subc->j; 2691 imat_i = subc->i; 2692 imat_a = subc->a; 2693 2694 if (!allcolumns[i]) cmap_i = cmap[i]; 2695 rmap_i = rmap[i]; 2696 irow_i = irow[i]; 2697 jmax = nrow[i]; 2698 for (j = 0; j < jmax; j++) { 2699 row = irow_i[j]; 2700 proc = row2proc_i[j]; 2701 if (proc == rank) { 2702 old_row = row; 2703 #if defined(PETSC_USE_CTABLE) 2704 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2705 row--; 2706 #else 2707 row = rmap_i[row]; 2708 #endif 2709 ilen_row = imat_ilen[row]; 2710 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2711 mat_i = imat_i[row]; 2712 mat_a = imat_a + mat_i; 2713 mat_j = imat_j + mat_i; 2714 if (!allcolumns[i]) { 2715 for (k = 0; k < ncols; k++) { 2716 #if defined(PETSC_USE_CTABLE) 2717 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2718 #else 2719 tcol = cmap_i[cols[k]]; 2720 #endif 2721 if (tcol) { 2722 *mat_j++ = tcol - 1; 2723 *mat_a++ = vals[k]; 2724 ilen_row++; 2725 } 2726 } 2727 } else { /* allcolumns */ 2728 for (k = 0; k < ncols; k++) { 2729 *mat_j++ = cols[k]; /* global col index! 
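                                   (with allcolumns the submatrix spans all of C's global columns, so the
                                    global index is already the submatrix column index)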
*/ 2730 *mat_a++ = vals[k]; 2731 ilen_row++; 2732 } 2733 } 2734 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2735 2736 imat_ilen[row] = ilen_row; 2737 } 2738 } 2739 } 2740 2741 /* Now assemble the off proc rows */ 2742 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2743 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2744 sbuf1_i = sbuf1[pa[tmp2]]; 2745 jmax = sbuf1_i[0]; 2746 ct1 = 2 * jmax + 1; 2747 ct2 = 0; 2748 rbuf2_i = rbuf2[tmp2]; 2749 rbuf3_i = rbuf3[tmp2]; 2750 rbuf4_i = rbuf4[tmp2]; 2751 for (j = 1; j <= jmax; j++) { 2752 is_no = sbuf1_i[2 * j - 1]; 2753 rmap_i = rmap[is_no]; 2754 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2755 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2756 imat_ilen = subc->ilen; 2757 imat_j = subc->j; 2758 imat_i = subc->i; 2759 imat_a = subc->a; 2760 max1 = sbuf1_i[2 * j]; 2761 for (k = 0; k < max1; k++, ct1++) { 2762 row = sbuf1_i[ct1]; 2763 #if defined(PETSC_USE_CTABLE) 2764 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2765 row--; 2766 #else 2767 row = rmap_i[row]; 2768 #endif 2769 ilen = imat_ilen[row]; 2770 mat_i = imat_i[row]; 2771 mat_a = imat_a + mat_i; 2772 mat_j = imat_j + mat_i; 2773 max2 = rbuf2_i[ct1]; 2774 if (!allcolumns[is_no]) { 2775 for (l = 0; l < max2; l++, ct2++) { 2776 #if defined(PETSC_USE_CTABLE) 2777 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2778 #else 2779 tcol = cmap_i[rbuf3_i[ct2]]; 2780 #endif 2781 if (tcol) { 2782 *mat_j++ = tcol - 1; 2783 *mat_a++ = rbuf4_i[ct2]; 2784 ilen++; 2785 } 2786 } 2787 } else { /* allcolumns */ 2788 for (l = 0; l < max2; l++, ct2++) { 2789 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2790 *mat_a++ = rbuf4_i[ct2]; 2791 ilen++; 2792 } 2793 } 2794 imat_ilen[row] = ilen; 2795 } 2796 } 2797 } 2798 2799 if (!iscsorted) { /* sort column indices of the rows */ 2800 for (i = 0; i < ismax; i++) { 2801 subc = (Mat_SeqAIJ *)submats[i]->data; 2802 imat_j = subc->j; 2803 imat_i = subc->i; 2804 imat_a = subc->a; 2805 imat_ilen = subc->ilen; 2806 2807 if (allcolumns[i]) continue; 2808 jmax = nrow[i]; 2809 for (j = 0; j < jmax; j++) { 2810 mat_i = imat_i[j]; 2811 mat_a = imat_a + mat_i; 2812 mat_j = imat_j + mat_i; 2813 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2814 } 2815 } 2816 } 2817 2818 PetscCall(PetscFree(r_waits4)); 2819 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2820 PetscCall(PetscFree(s_waits4)); 2821 2822 /* Restore the indices */ 2823 for (i = 0; i < ismax; i++) { 2824 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2825 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2826 } 2827 2828 for (i = 0; i < ismax; i++) { 2829 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2830 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2831 } 2832 2833 /* Destroy allocated memory */ 2834 if (sbuf_aa) { 2835 PetscCall(PetscFree(sbuf_aa[0])); 2836 PetscCall(PetscFree(sbuf_aa)); 2837 } 2838 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2839 2840 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2841 PetscCall(PetscFree(rbuf4)); 2842 2843 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2844 PetscFunctionReturn(PETSC_SUCCESS); 2845 } 2846 2847 /* 2848 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 
2849 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2850 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2851 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2852 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2853 state, and needs to be "assembled" later by compressing B's column space. 2854 2855 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2856 Following this call, C->A & C->B have been created, even if empty. 2857 */ 2858 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2859 { 2860 /* If making this function public, change the error returned in this function away from _PLIB. */ 2861 Mat_MPIAIJ *aij; 2862 Mat_SeqAIJ *Baij; 2863 PetscBool seqaij, Bdisassembled; 2864 PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count; 2865 PetscScalar v; 2866 const PetscInt *rowindices, *colindices; 2867 2868 PetscFunctionBegin; 2869 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2870 if (A) { 2871 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2872 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2873 if (rowemb) { 2874 PetscCall(ISGetLocalSize(rowemb, &m)); 2875 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2876 } else { 2877 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2878 } 2879 if (dcolemb) { 2880 PetscCall(ISGetLocalSize(dcolemb, &n)); 2881 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2882 } else { 2883 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2884 } 2885 } 2886 if (B) { 2887 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2888 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2889 if (rowemb) { 2890 PetscCall(ISGetLocalSize(rowemb, &m)); 2891 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2892 } else { 2893 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2894 } 2895 if (ocolemb) { 2896 PetscCall(ISGetLocalSize(ocolemb, &n)); 2897 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2898 } else { 2899 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2900 } 2901 } 2902 2903 aij = (Mat_MPIAIJ *)C->data; 2904 if (!aij->A) { 2905 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2906 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2907 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2908 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2909 
PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2910 } 2911 if (A) { 2912 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2913 } else { 2914 PetscCall(MatSetUp(aij->A)); 2915 } 2916 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2917 /* 2918 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2919 need to "disassemble" B -- convert it to using C's global indices. 2920 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2921 2922 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2923 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2924 2925 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2926 At least avoid calling MatSetValues() and the implied searches? 2927 */ 2928 2929 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2930 #if defined(PETSC_USE_CTABLE) 2931 PetscCall(PetscHMapIDestroy(&aij->colmap)); 2932 #else 2933 PetscCall(PetscFree(aij->colmap)); 2934 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2935 #endif 2936 ngcol = 0; 2937 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2938 if (aij->garray) PetscCall(PetscFree(aij->garray)); 2939 PetscCall(VecDestroy(&aij->lvec)); 2940 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2941 } 2942 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2943 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2944 } 2945 Bdisassembled = PETSC_FALSE; 2946 if (!aij->B) { 2947 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2948 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2949 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2950 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2951 Bdisassembled = PETSC_TRUE; 2952 } 2953 if (B) { 2954 Baij = (Mat_SeqAIJ *)B->data; 2955 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2956 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2957 for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2958 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2959 PetscCall(PetscFree(nz)); 2960 } 2961 2962 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2963 shift = rend - rstart; 2964 count = 0; 2965 rowindices = NULL; 2966 colindices = NULL; 2967 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2968 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2969 for (i = 0; i < B->rmap->n; i++) { 2970 PetscInt row; 2971 row = i; 2972 if (rowindices) row = rowindices[i]; 2973 for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2974 col = Baij->j[count]; 2975 if (colindices) col = colindices[col]; 2976 if (Bdisassembled && col >= rstart) col += shift; 2977 v = Baij->a[count]; 2978 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2979 ++count; 2980 } 2981 } 2982 /* No assembly for aij->B is necessary. */ 2983 /* FIXME: set aij->B's nonzerostate correctly. */ 2984 } else { 2985 PetscCall(MatSetUp(aij->B)); 2986 } 2987 C->preallocated = PETSC_TRUE; 2988 C->was_assembled = PETSC_FALSE; 2989 C->assembled = PETSC_FALSE; 2990 /* 2991 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2992 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 
2993 */ 2994 PetscFunctionReturn(PETSC_SUCCESS); 2995 } 2996 2997 /* 2998 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2999 */ 3000 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 3001 { 3002 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 3003 3004 PetscFunctionBegin; 3005 PetscValidPointer(A, 2); 3006 PetscValidPointer(B, 3); 3007 /* FIXME: make sure C is assembled */ 3008 *A = aij->A; 3009 *B = aij->B; 3010 /* Note that we don't incref *A and *B, so be careful! */ 3011 PetscFunctionReturn(PETSC_SUCCESS); 3012 } 3013 3014 /* 3015 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3016 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3017 */ 3018 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat)) 3019 { 3020 PetscMPIInt size, flag; 3021 PetscInt i, ii, cismax, ispar; 3022 Mat *A, *B; 3023 IS *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p; 3024 3025 PetscFunctionBegin; 3026 if (!ismax) PetscFunctionReturn(PETSC_SUCCESS); 3027 3028 for (i = 0, cismax = 0; i < ismax; ++i) { 3029 PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag)); 3030 PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3031 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size)); 3032 if (size > 1) ++cismax; 3033 } 3034 3035 /* 3036 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3037 ispar counts the number of parallel ISs across C's comm. 3038 */ 3039 PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C))); 3040 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3041 PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat)); 3042 PetscFunctionReturn(PETSC_SUCCESS); 3043 } 3044 3045 /* if (ispar) */ 3046 /* 3047 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3048 These are used to extract the off-diag portion of the resulting parallel matrix. 3049 The row IS for the off-diag portion is the same as for the diag portion, 3050 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3051 */ 3052 PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol)); 3053 PetscCall(PetscMalloc1(cismax, &ciscol_p)); 3054 for (i = 0, ii = 0; i < ismax; ++i) { 3055 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size)); 3056 if (size > 1) { 3057 /* 3058 TODO: This is the part that's ***NOT SCALABLE***. 3059 To fix this we need to extract just the indices of C's nonzero columns 3060 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3061 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3062 to be done without serializing on the IS list, so, most likely, it is best 3063 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 
3064 */ 3065 PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii]))); 3066 /* Now we have to 3067 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3068 were sorted on each rank, concatenated they might no longer be sorted; 3069 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3070 indices in the nondecreasing order to the original index positions. 3071 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3072 */ 3073 PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii)); 3074 PetscCall(ISSort(ciscol[ii])); 3075 ++ii; 3076 } 3077 } 3078 PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p)); 3079 for (i = 0, ii = 0; i < ismax; ++i) { 3080 PetscInt j, issize; 3081 const PetscInt *indices; 3082 3083 /* 3084 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3085 */ 3086 PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i)); 3087 PetscCall(ISSort(isrow[i])); 3088 PetscCall(ISGetLocalSize(isrow[i], &issize)); 3089 PetscCall(ISGetIndices(isrow[i], &indices)); 3090 for (j = 1; j < issize; ++j) { 3091 PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]); 3092 } 3093 PetscCall(ISRestoreIndices(isrow[i], &indices)); 3094 PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i)); 3095 PetscCall(ISSort(iscol[i])); 3096 PetscCall(ISGetLocalSize(iscol[i], &issize)); 3097 PetscCall(ISGetIndices(iscol[i], &indices)); 3098 for (j = 1; j < issize; ++j) { 3099 PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]); 3100 } 3101 PetscCall(ISRestoreIndices(iscol[i], &indices)); 3102 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size)); 3103 if (size > 1) { 3104 cisrow[ii] = isrow[i]; 3105 ++ii; 3106 } 3107 } 3108 /* 3109 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3110 array of sequential matrices underlying the resulting parallel matrices. 3111 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3112 contain duplicates. 3113 3114 There are as many diag matrices as there are original index sets. There are only as many parallel 3115 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3116 3117 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3118 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3119 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3120 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3121 with A[i] and B[ii] extracted from the corresponding MPI submat. 3122 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3123 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3124 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3125 (retrieve composed AA[i] or BB[ii]) and reuse them here. 
AA[i] and BB[ii] are then used to permute its 3126 values into A[i] and B[ii] sitting inside the corresponding submat. 3127 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3128 A[i], B[ii], AA[i] or BB[ii] matrices. 3129 */ 3130 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3131 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat)); 3132 3133 /* Now obtain the sequential A and B submatrices separately. */ 3134 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3135 PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A)); 3136 PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B)); 3137 3138 /* 3139 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3140 matrices A & B have been extracted directly into the parallel matrices containing them, or 3141 simply into the sequential matrix identical with the corresponding A (if size == 1). 3142 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3143 to have the same sparsity pattern. 3144 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3145 must be constructed for C. This is done by setseqmat(s). 3146 */ 3147 for (i = 0, ii = 0; i < ismax; ++i) { 3148 /* 3149 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3150 That way we can avoid sorting and computing permutations when reusing. 3151 To this end: 3152 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3153 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3154 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3155 */ 3156 MatStructure pattern = DIFFERENT_NONZERO_PATTERN; 3157 3158 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size)); 3159 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3160 if (size > 1) { 3161 if (scall == MAT_INITIAL_MATRIX) { 3162 PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i)); 3163 PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE)); 3164 PetscCall(MatSetType((*submat)[i], MATMPIAIJ)); 3165 PetscCall(PetscLayoutSetUp((*submat)[i]->rmap)); 3166 PetscCall(PetscLayoutSetUp((*submat)[i]->cmap)); 3167 } 3168 /* 3169 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3170 */ 3171 { 3172 Mat AA = A[i], BB = B[ii]; 3173 3174 if (AA || BB) { 3175 PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB)); 3176 PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY)); 3177 PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY)); 3178 } 3179 PetscCall(MatDestroy(&AA)); 3180 } 3181 PetscCall(ISDestroy(ciscol + ii)); 3182 PetscCall(ISDestroy(ciscol_p + ii)); 3183 ++ii; 3184 } else { /* if (size == 1) */ 3185 if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i])); 3186 if (isrow_p[i] || iscol_p[i]) { 3187 PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i)); 3188 PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i])); 3189 /* Otherwise A is extracted straight into (*submats)[i]. 
        */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i];
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(PETSC_SUCCESS);
}
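/*
  A minimal usage sketch (illustration only, not part of the library): the routines in this file back the
  public MatIncreaseOverlap() and MatCreateSubMatrices() interfaces for MATMPIAIJ. A caller such as an
  additive Schwarz setup might use them roughly as follows; the matrix C, the index sets is[], and the
  overlap ov are assumed to be provided by the caller, and error handling is abbreviated.

    Mat  C;          // parallel MATMPIAIJ matrix
    IS   is[1];      // index set(s) owned by this process
    Mat *submats;    // resulting sequential submatrices

    PetscCall(MatIncreaseOverlap(C, 1, is, ov));                                 // grow each IS by ov levels
    PetscCall(MatCreateSubMatrices(C, 1, is, is, MAT_INITIAL_MATRIX, &submats)); // extract SeqAIJ submatrices
    // ... later, after C's numerical values change but its nonzero pattern does not:
    PetscCall(MatCreateSubMatrices(C, 1, is, is, MAT_REUSE_MATRIX, &submats));
    PetscCall(MatDestroySubMatrices(1, &submats));
*/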