1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 /* 22 Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks 23 24 The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but 25 that is a very major recoding job. 
26 27 Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin 28 */ 29 static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[]) 30 { 31 PetscFunctionBegin; 32 for (PetscInt i = 0; i < imax; i++) { 33 if (!is[i]) break; 34 PetscInt n = 0, N, Nmax, Nmin; 35 const PetscInt *idx; 36 PetscInt *nidx = NULL; 37 MPI_Comm comm; 38 PetscBT bt; 39 40 PetscCall(ISGetLocalSize(is[i], &N)); 41 if (N > 0) { /* Nmax and Nmin are garbage for empty IS */ 42 PetscCall(ISGetIndices(is[i], &idx)); 43 PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax)); 44 Nmin = Nmin / bs; 45 Nmax = Nmax / bs; 46 PetscCall(PetscBTCreate(Nmax - Nmin, &bt)); 47 for (PetscInt j = 0; j < N; j++) { 48 if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++; 49 } 50 PetscCall(PetscMalloc1(n, &nidx)); 51 n = 0; 52 PetscCall(PetscBTMemzero(Nmax - Nmin, bt)); 53 for (PetscInt j = 0; j < N; j++) { 54 if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs; 55 } 56 PetscCall(PetscBTDestroy(&bt)); 57 PetscCall(ISRestoreIndices(is[i], &idx)); 58 } 59 PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm)); 60 PetscCall(ISDestroy(is + i)); 61 PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i)); 62 } 63 PetscFunctionReturn(0); 64 } 65 66 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov) 67 { 68 PetscInt i; 69 70 PetscFunctionBegin; 71 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 72 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is)); 73 if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is)); 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov) 78 { 79 PetscInt i; 80 81 PetscFunctionBegin; 82 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, 
"Negative overlap specified"); 83 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is)); 84 if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is)); 85 PetscFunctionReturn(0); 86 } 87 88 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[]) 89 { 90 MPI_Comm comm; 91 PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j; 92 PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata; 93 PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL; 94 const PetscInt *indices_i, **indices; 95 PetscLayout rmap; 96 PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner; 97 PetscSF sf; 98 PetscSFNode *remote; 99 100 PetscFunctionBegin; 101 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 102 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 103 PetscCallMPI(MPI_Comm_size(comm, &size)); 104 /* get row map to determine where rows should be going */ 105 PetscCall(MatGetLayouts(mat, &rmap, NULL)); 106 /* retrieve IS data and put all together so that we 107 * can optimize communication 108 * */ 109 PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length)); 110 for (i = 0, tlength = 0; i < nidx; i++) { 111 PetscCall(ISGetLocalSize(is[i], &length[i])); 112 tlength += length[i]; 113 PetscCall(ISGetIndices(is[i], &indices[i])); 114 } 115 /* find these rows on remote processors */ 116 PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids)); 117 PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp)); 118 nrrows = 0; 119 for (i = 0; i < nidx; i++) { 120 length_i = length[i]; 121 indices_i = indices[i]; 122 for (j = 0; j < length_i; j++) { 123 owner = -1; 124 PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner)); 125 /* remote processors */ 126 if (owner != rank) { 127 tosizes_temp[owner]++; /* number of rows to owner */ 
128 rrow_ranks[nrrows] = owner; /* processor */ 129 rrow_isids[nrrows] = i; /* is id */ 130 remoterows[nrrows++] = indices_i[j]; /* row */ 131 } 132 } 133 PetscCall(ISRestoreIndices(is[i], &indices[i])); 134 } 135 PetscCall(PetscFree2(*(PetscInt ***)&indices, length)); 136 /* test if we need to exchange messages 137 * generally speaking, we do not need to exchange 138 * data when overlap is 1 139 * */ 140 PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm)); 141 /* we do not have any messages 142 * It usually corresponds to overlap 1 143 * */ 144 if (!reducednrrows) { 145 PetscCall(PetscFree3(toranks, tosizes, tosizes_temp)); 146 PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids)); 147 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is)); 148 PetscFunctionReturn(0); 149 } 150 nto = 0; 151 /* send sizes and ranks for building a two-sided communcation */ 152 for (i = 0; i < size; i++) { 153 if (tosizes_temp[i]) { 154 tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */ 155 tosizes_temp[i] = nto; /* a map from processor to index */ 156 toranks[nto++] = i; /* processor */ 157 } 158 } 159 PetscCall(PetscMalloc1(nto + 1, &toffsets)); 160 toffsets[0] = 0; 161 for (i = 0; i < nto; i++) { 162 toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */ 163 tosizes[2 * i + 1] = toffsets[i]; /* offsets to send */ 164 } 165 /* send information to other processors */ 166 PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes)); 167 nrecvrows = 0; 168 for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i]; 169 PetscCall(PetscMalloc1(nrecvrows, &remote)); 170 nrecvrows = 0; 171 for (i = 0; i < nfrom; i++) { 172 for (j = 0; j < fromsizes[2 * i]; j++) { 173 remote[nrecvrows].rank = fromranks[i]; 174 remote[nrecvrows++].index = fromsizes[2 * i + 1] + j; 175 } 176 } 177 PetscCall(PetscSFCreate(comm, &sf)); 178 PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, 
remote, PETSC_OWN_POINTER)); 179 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 180 PetscCall(PetscSFSetType(sf, PETSCSFBASIC)); 181 PetscCall(PetscSFSetFromOptions(sf)); 182 /* message pair <no of is, row> */ 183 PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata)); 184 for (i = 0; i < nrrows; i++) { 185 owner = rrow_ranks[i]; /* processor */ 186 j = tosizes_temp[owner]; /* index */ 187 todata[toffsets[j]++] = rrow_isids[i]; 188 todata[toffsets[j]++] = remoterows[i]; 189 } 190 PetscCall(PetscFree3(toranks, tosizes, tosizes_temp)); 191 PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids)); 192 PetscCall(PetscFree(toffsets)); 193 PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE)); 194 PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE)); 195 PetscCall(PetscSFDestroy(&sf)); 196 /* send rows belonging to the remote so that then we could get the overlapping data back */ 197 PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata)); 198 PetscCall(PetscFree2(todata, fromdata)); 199 PetscCall(PetscFree(fromsizes)); 200 PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes)); 201 PetscCall(PetscFree(fromranks)); 202 nrecvrows = 0; 203 for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i]; 204 PetscCall(PetscCalloc1(nrecvrows, &todata)); 205 PetscCall(PetscMalloc1(nrecvrows, &remote)); 206 nrecvrows = 0; 207 for (i = 0; i < nto; i++) { 208 for (j = 0; j < tosizes[2 * i]; j++) { 209 remote[nrecvrows].rank = toranks[i]; 210 remote[nrecvrows++].index = tosizes[2 * i + 1] + j; 211 } 212 } 213 PetscCall(PetscSFCreate(comm, &sf)); 214 PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER)); 215 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 216 PetscCall(PetscSFSetType(sf, 
PETSCSFBASIC)); 217 PetscCall(PetscSFSetFromOptions(sf)); 218 /* overlap communication and computation */ 219 PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE)); 220 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is)); 221 PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE)); 222 PetscCall(PetscSFDestroy(&sf)); 223 PetscCall(PetscFree2(sbdata, sbsizes)); 224 PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata)); 225 PetscCall(PetscFree(toranks)); 226 PetscCall(PetscFree(tosizes)); 227 PetscCall(PetscFree(todata)); 228 PetscFunctionReturn(0); 229 } 230 231 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 232 { 233 PetscInt *isz, isz_i, i, j, is_id, data_size; 234 PetscInt col, lsize, max_lsize, *indices_temp, *indices_i; 235 const PetscInt *indices_i_temp; 236 MPI_Comm *iscomms; 237 238 PetscFunctionBegin; 239 max_lsize = 0; 240 PetscCall(PetscMalloc1(nidx, &isz)); 241 for (i = 0; i < nidx; i++) { 242 PetscCall(ISGetLocalSize(is[i], &lsize)); 243 max_lsize = lsize > max_lsize ? 
lsize : max_lsize; 244 isz[i] = lsize; 245 } 246 PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms)); 247 for (i = 0; i < nidx; i++) { 248 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL)); 249 PetscCall(ISGetIndices(is[i], &indices_i_temp)); 250 PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i])); 251 PetscCall(ISRestoreIndices(is[i], &indices_i_temp)); 252 PetscCall(ISDestroy(&is[i])); 253 } 254 /* retrieve information to get row id and its overlap */ 255 for (i = 0; i < nrecvs;) { 256 is_id = recvdata[i++]; 257 data_size = recvdata[i++]; 258 indices_i = indices_temp + (max_lsize + nrecvs) * is_id; 259 isz_i = isz[is_id]; 260 for (j = 0; j < data_size; j++) { 261 col = recvdata[i++]; 262 indices_i[isz_i++] = col; 263 } 264 isz[is_id] = isz_i; 265 } 266 /* remove duplicate entities */ 267 for (i = 0; i < nidx; i++) { 268 indices_i = indices_temp + (max_lsize + nrecvs) * i; 269 isz_i = isz[i]; 270 PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i)); 271 PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i])); 272 PetscCall(PetscCommDestroy(&iscomms[i])); 273 } 274 PetscCall(PetscFree(isz)); 275 PetscCall(PetscFree2(indices_temp, iscomms)); 276 PetscFunctionReturn(0); 277 } 278 279 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 280 { 281 PetscLayout rmap, cmap; 282 PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i; 283 PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata; 284 PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets; 285 const PetscInt *gcols, *ai, *aj, *bi, *bj; 286 Mat amat, bmat; 287 PetscMPIInt rank; 288 PetscBool done; 289 MPI_Comm comm; 290 291 PetscFunctionBegin; 292 
PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 293 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 294 PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols)); 295 /* Even if the mat is symmetric, we still assume it is not symmetric */ 296 PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 297 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 298 PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 299 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 300 /* total number of nonzero values is used to estimate the memory usage in the next step */ 301 tnz = ai[an] + bi[bn]; 302 PetscCall(MatGetLayouts(mat, &rmap, &cmap)); 303 PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL)); 304 PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL)); 305 /* to find the longest message */ 306 max_fszs = 0; 307 for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs; 308 /* better way to estimate number of nonzero in the mat??? 
*/ 309 PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp)); 310 for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i; 311 rows_pos = 0; 312 totalrows = 0; 313 for (i = 0; i < nfrom; i++) { 314 PetscCall(PetscArrayzero(rows_pos_i, nidx)); 315 /* group data together */ 316 for (j = 0; j < fromsizes[2 * i]; j += 2) { 317 is_id = fromrows[rows_pos++]; /* no of is */ 318 rows_i = rows_data[is_id]; 319 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */ 320 } 321 /* estimate a space to avoid multiple allocations */ 322 for (j = 0; j < nidx; j++) { 323 indvc_ij = 0; 324 rows_i = rows_data[j]; 325 for (l = 0; l < rows_pos_i[j]; l++) { 326 row = rows_i[l] - rstart; 327 start = ai[row]; 328 end = ai[row + 1]; 329 for (k = start; k < end; k++) { /* Amat */ 330 col = aj[k] + cstart; 331 indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */ 332 } 333 start = bi[row]; 334 end = bi[row + 1]; 335 for (k = start; k < end; k++) { /* Bmat */ 336 col = gcols[bj[k]]; 337 indices_tmp[indvc_ij++] = col; 338 } 339 } 340 PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp)); 341 indv_counts[i * nidx + j] = indvc_ij; 342 totalrows += indvc_ij; 343 } 344 } 345 /* message triple <no of is, number of rows, rows> */ 346 PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes)); 347 totalrows = 0; 348 rows_pos = 0; 349 /* use this code again */ 350 for (i = 0; i < nfrom; i++) { 351 PetscCall(PetscArrayzero(rows_pos_i, nidx)); 352 for (j = 0; j < fromsizes[2 * i]; j += 2) { 353 is_id = fromrows[rows_pos++]; 354 rows_i = rows_data[is_id]; 355 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 356 } 357 /* add data */ 358 for (j = 0; j < nidx; j++) { 359 if (!indv_counts[i * nidx + j]) continue; 360 indvc_ij = 0; 361 sbdata[totalrows++] = j; 362 sbdata[totalrows++] = indv_counts[i * nidx + j]; 363 sbsizes[2 * i] += 2; 364 
rows_i = rows_data[j]; 365 for (l = 0; l < rows_pos_i[j]; l++) { 366 row = rows_i[l] - rstart; 367 start = ai[row]; 368 end = ai[row + 1]; 369 for (k = start; k < end; k++) { /* Amat */ 370 col = aj[k] + cstart; 371 indices_tmp[indvc_ij++] = col; 372 } 373 start = bi[row]; 374 end = bi[row + 1]; 375 for (k = start; k < end; k++) { /* Bmat */ 376 col = gcols[bj[k]]; 377 indices_tmp[indvc_ij++] = col; 378 } 379 } 380 PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp)); 381 sbsizes[2 * i] += indvc_ij; 382 PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij)); 383 totalrows += indvc_ij; 384 } 385 } 386 PetscCall(PetscMalloc1(nfrom + 1, &offsets)); 387 offsets[0] = 0; 388 for (i = 0; i < nfrom; i++) { 389 offsets[i + 1] = offsets[i] + sbsizes[2 * i]; 390 sbsizes[2 * i + 1] = offsets[i]; 391 } 392 PetscCall(PetscFree(offsets)); 393 if (sbrowsizes) *sbrowsizes = sbsizes; 394 if (sbrows) *sbrows = sbdata; 395 PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp)); 396 PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 397 PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 398 PetscFunctionReturn(0); 399 } 400 401 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[]) 402 { 403 const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices; 404 PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp; 405 PetscInt lsize, lsize_tmp; 406 PetscMPIInt rank, owner; 407 Mat amat, bmat; 408 PetscBool done; 409 PetscLayout cmap, rmap; 410 MPI_Comm comm; 411 412 PetscFunctionBegin; 413 PetscCall(PetscObjectGetComm((PetscObject)mat, &comm)); 414 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 415 PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols)); 416 PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 417 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row 
IJ "); 418 PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 419 PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ "); 420 /* is it a safe way to compute number of nonzero values ? */ 421 tnz = ai[an] + bi[bn]; 422 PetscCall(MatGetLayouts(mat, &rmap, &cmap)); 423 PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL)); 424 PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL)); 425 /* it is a better way to estimate memory than the old implementation 426 * where global size of matrix is used 427 * */ 428 PetscCall(PetscMalloc1(tnz, &indices_temp)); 429 for (i = 0; i < nidx; i++) { 430 MPI_Comm iscomm; 431 432 PetscCall(ISGetLocalSize(is[i], &lsize)); 433 PetscCall(ISGetIndices(is[i], &indices)); 434 lsize_tmp = 0; 435 for (j = 0; j < lsize; j++) { 436 owner = -1; 437 row = indices[j]; 438 PetscCall(PetscLayoutFindOwner(rmap, row, &owner)); 439 if (owner != rank) continue; 440 /* local number */ 441 row -= rstart; 442 start = ai[row]; 443 end = ai[row + 1]; 444 for (k = start; k < end; k++) { /* Amat */ 445 col = aj[k] + cstart; 446 indices_temp[lsize_tmp++] = col; 447 } 448 start = bi[row]; 449 end = bi[row + 1]; 450 for (k = start; k < end; k++) { /* Bmat */ 451 col = gcols[bj[k]]; 452 indices_temp[lsize_tmp++] = col; 453 } 454 } 455 PetscCall(ISRestoreIndices(is[i], &indices)); 456 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL)); 457 PetscCall(ISDestroy(&is[i])); 458 PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp)); 459 PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i])); 460 PetscCall(PetscCommDestroy(&iscomm)); 461 } 462 PetscCall(PetscFree(indices_temp)); 463 PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done)); 464 PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done)); 465 PetscFunctionReturn(0); 466 } 467 468 /* 469 Sample message format: 470 If a processor A wants 
processor B to process some elements corresponding 471 to index sets is[1],is[5] 472 mesg [0] = 2 (no of index sets in the mesg) 473 ----------- 474 mesg [1] = 1 => is[1] 475 mesg [2] = sizeof(is[1]); 476 ----------- 477 mesg [3] = 5 => is[5] 478 mesg [4] = sizeof(is[5]); 479 ----------- 480 mesg [5] 481 mesg [n] datas[1] 482 ----------- 483 mesg[n+1] 484 mesg[m] data(is[5]) 485 ----------- 486 487 Notes: 488 nrqs - no of requests sent (or to be sent out) 489 nrqr - no of requests received (which have to be or which have been processed) 490 */ 491 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[]) 492 { 493 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 494 PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2; 495 const PetscInt **idx, *idx_i; 496 PetscInt *n, **data, len; 497 #if defined(PETSC_USE_CTABLE) 498 PetscHMapI *table_data, table_data_i; 499 PetscInt *tdata, tcount, tcount_max; 500 #else 501 PetscInt *data_i, *d_p; 502 #endif 503 PetscMPIInt size, rank, tag1, tag2, proc = 0; 504 PetscInt M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr; 505 PetscInt *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2; 506 PetscBT *table; 507 MPI_Comm comm; 508 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2; 509 MPI_Status *recv_status; 510 MPI_Comm *iscomms; 511 char *t_p; 512 513 PetscFunctionBegin; 514 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 515 size = c->size; 516 rank = c->rank; 517 M = C->rmap->N; 518 519 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1)); 520 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 521 522 PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n)); 523 524 for (i = 0; i < imax; i++) { 525 PetscCall(ISGetIndices(is[i], &idx[i])); 526 PetscCall(ISGetLocalSize(is[i], &n[i])); 527 } 528 529 /* evaluate communication - mesg to who,length of mesg, and buffer space 530 required. 
Based on this, buffers are allocated, and data copied into them */ 531 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); 532 for (i = 0; i < imax; i++) { 533 PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/ 534 idx_i = idx[i]; 535 len = n[i]; 536 for (j = 0; j < len; j++) { 537 row = idx_i[j]; 538 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries"); 539 PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc)); 540 w4[proc]++; 541 } 542 for (j = 0; j < size; j++) { 543 if (w4[j]) { 544 w1[j] += w4[j]; 545 w3[j]++; 546 } 547 } 548 } 549 550 nrqs = 0; /* no of outgoing messages */ 551 msz = 0; /* total mesg length (for all proc */ 552 w1[rank] = 0; /* no mesg sent to intself */ 553 w3[rank] = 0; 554 for (i = 0; i < size; i++) { 555 if (w1[i]) { 556 w2[i] = 1; 557 nrqs++; 558 } /* there exists a message to proc i */ 559 } 560 /* pa - is list of processors to communicate with */ 561 PetscCall(PetscMalloc1(nrqs, &pa)); 562 for (i = 0, j = 0; i < size; i++) { 563 if (w1[i]) { 564 pa[j] = i; 565 j++; 566 } 567 } 568 569 /* Each message would have a header = 1 + 2*(no of IS) + data */ 570 for (i = 0; i < nrqs; i++) { 571 j = pa[i]; 572 w1[j] += w2[j] + 2 * w3[j]; 573 msz += w1[j]; 574 } 575 576 /* Determine the number of messages to expect, their lengths, from from-ids */ 577 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 578 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 579 580 /* Now post the Irecvs corresponding to these messages */ 581 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1)); 582 583 /* Allocate Memory for outgoing messages */ 584 PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr)); 585 PetscCall(PetscArrayzero(outdat, size)); 586 PetscCall(PetscArrayzero(ptr, size)); 587 588 { 589 PetscInt *iptr = tmp, ict = 0; 590 for (i = 0; i < nrqs; i++) { 591 j = pa[i]; 592 iptr += 
ict; 593 outdat[j] = iptr; 594 ict = w1[j]; 595 } 596 } 597 598 /* Form the outgoing messages */ 599 /* plug in the headers */ 600 for (i = 0; i < nrqs; i++) { 601 j = pa[i]; 602 outdat[j][0] = 0; 603 PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j])); 604 ptr[j] = outdat[j] + 2 * w3[j] + 1; 605 } 606 607 /* Memory for doing local proc's work */ 608 { 609 PetscInt M_BPB_imax = 0; 610 #if defined(PETSC_USE_CTABLE) 611 PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax)); 612 PetscCall(PetscMalloc1(imax, &table_data)); 613 for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i)); 614 PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p)); 615 for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i; 616 #else 617 PetscInt Mimax = 0; 618 PetscCall(PetscIntMultError(M, imax, &Mimax)); 619 PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax)); 620 PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p)); 621 for (i = 0; i < imax; i++) { 622 table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i; 623 data[i] = d_p + M * i; 624 } 625 #endif 626 } 627 628 /* Parse the IS and update local tables and the outgoing buf with the data */ 629 { 630 PetscInt n_i, isz_i, *outdat_j, ctr_j; 631 PetscBT table_i; 632 633 for (i = 0; i < imax; i++) { 634 PetscCall(PetscArrayzero(ctr, size)); 635 n_i = n[i]; 636 table_i = table[i]; 637 idx_i = idx[i]; 638 #if defined(PETSC_USE_CTABLE) 639 table_data_i = table_data[i]; 640 #else 641 data_i = data[i]; 642 #endif 643 isz_i = isz[i]; 644 for (j = 0; j < n_i; j++) { /* parse the indices of each IS */ 645 row = idx_i[j]; 646 PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc)); 647 if (proc != rank) { /* copy to the outgoing buffer */ 648 ctr[proc]++; 649 *ptr[proc] = row; 650 ptr[proc]++; 651 } else if (!PetscBTLookupSet(table_i, row)) { 652 #if defined(PETSC_USE_CTABLE) 653 
PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1)); 654 #else 655 data_i[isz_i] = row; /* Update the local table */ 656 #endif 657 isz_i++; 658 } 659 } 660 /* Update the headers for the current IS */ 661 for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */ 662 if ((ctr_j = ctr[j])) { 663 outdat_j = outdat[j]; 664 k = ++outdat_j[0]; 665 outdat_j[2 * k] = ctr_j; 666 outdat_j[2 * k - 1] = i; 667 } 668 } 669 isz[i] = isz_i; 670 } 671 } 672 673 /* Now post the sends */ 674 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 675 for (i = 0; i < nrqs; ++i) { 676 j = pa[i]; 677 PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i)); 678 } 679 680 /* No longer need the original indices */ 681 PetscCall(PetscMalloc1(imax, &iscomms)); 682 for (i = 0; i < imax; ++i) { 683 PetscCall(ISRestoreIndices(is[i], idx + i)); 684 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL)); 685 } 686 PetscCall(PetscFree2(*(PetscInt ***)&idx, n)); 687 688 for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i])); 689 690 /* Do Local work */ 691 #if defined(PETSC_USE_CTABLE) 692 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data)); 693 #else 694 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL)); 695 #endif 696 697 /* Receive messages */ 698 PetscCall(PetscMalloc1(nrqr, &recv_status)); 699 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status)); 700 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 701 702 /* Phase 1 sends are complete - deallocate buffers */ 703 PetscCall(PetscFree4(outdat, ptr, tmp, ctr)); 704 PetscCall(PetscFree4(w1, w2, w3, w4)); 705 706 PetscCall(PetscMalloc1(nrqr, &xdata)); 707 PetscCall(PetscMalloc1(nrqr, &isz1)); 708 PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1)); 709 PetscCall(PetscFree(rbuf[0])); 710 PetscCall(PetscFree(rbuf)); 711 712 /* Send the data back */ 713 /* Do a global reduction to know the 
buffer space req for incoming messages */ 714 { 715 PetscMPIInt *rw1; 716 717 PetscCall(PetscCalloc1(size, &rw1)); 718 719 for (i = 0; i < nrqr; ++i) { 720 proc = recv_status[i].MPI_SOURCE; 721 722 PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch"); 723 rw1[proc] = isz1[i]; 724 } 725 PetscCall(PetscFree(onodes1)); 726 PetscCall(PetscFree(olengths1)); 727 728 /* Determine the number of messages to expect, their lengths, from from-ids */ 729 PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2)); 730 PetscCall(PetscFree(rw1)); 731 } 732 /* Now post the Irecvs corresponding to these messages */ 733 PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2)); 734 735 /* Now post the sends */ 736 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 737 for (i = 0; i < nrqr; ++i) { 738 j = recv_status[i].MPI_SOURCE; 739 PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i)); 740 } 741 742 /* receive work done on other processors */ 743 { 744 PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax; 745 PetscMPIInt idex; 746 PetscBT table_i; 747 748 for (i = 0; i < nrqs; ++i) { 749 PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE)); 750 /* Process the message */ 751 rbuf2_i = rbuf2[idex]; 752 ct1 = 2 * rbuf2_i[0] + 1; 753 jmax = rbuf2[idex][0]; 754 for (j = 1; j <= jmax; j++) { 755 max = rbuf2_i[2 * j]; 756 is_no = rbuf2_i[2 * j - 1]; 757 isz_i = isz[is_no]; 758 table_i = table[is_no]; 759 #if defined(PETSC_USE_CTABLE) 760 table_data_i = table_data[is_no]; 761 #else 762 data_i = data[is_no]; 763 #endif 764 for (k = 0; k < max; k++, ct1++) { 765 row = rbuf2_i[ct1]; 766 if (!PetscBTLookupSet(table_i, row)) { 767 #if defined(PETSC_USE_CTABLE) 768 PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1)); 769 #else 770 data_i[isz_i] = row; 771 #endif 772 isz_i++; 773 } 774 } 775 isz[is_no] = isz_i; 776 } 777 } 778 779 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, 
MPI_STATUSES_IGNORE)); 780 } 781 782 #if defined(PETSC_USE_CTABLE) 783 tcount_max = 0; 784 for (i = 0; i < imax; ++i) { 785 table_data_i = table_data[i]; 786 PetscCall(PetscHMapIGetSize(table_data_i, &tcount)); 787 if (tcount_max < tcount) tcount_max = tcount; 788 } 789 PetscCall(PetscMalloc1(tcount_max + 1, &tdata)); 790 #endif 791 792 for (i = 0; i < imax; ++i) { 793 #if defined(PETSC_USE_CTABLE) 794 PetscHashIter tpos; 795 796 table_data_i = table_data[i]; 797 PetscHashIterBegin(table_data_i, tpos); 798 while (!PetscHashIterAtEnd(table_data_i, tpos)) { 799 PetscHashIterGetKey(table_data_i, tpos, k); 800 PetscHashIterGetVal(table_data_i, tpos, j); 801 PetscHashIterNext(table_data_i, tpos); 802 tdata[--j] = --k; 803 } 804 PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i)); 805 #else 806 PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i)); 807 #endif 808 PetscCall(PetscCommDestroy(&iscomms[i])); 809 } 810 811 PetscCall(PetscFree(iscomms)); 812 PetscCall(PetscFree(onodes2)); 813 PetscCall(PetscFree(olengths2)); 814 815 PetscCall(PetscFree(pa)); 816 PetscCall(PetscFree(rbuf2[0])); 817 PetscCall(PetscFree(rbuf2)); 818 PetscCall(PetscFree(s_waits1)); 819 PetscCall(PetscFree(r_waits1)); 820 PetscCall(PetscFree(s_waits2)); 821 PetscCall(PetscFree(r_waits2)); 822 PetscCall(PetscFree(recv_status)); 823 if (xdata) { 824 PetscCall(PetscFree(xdata[0])); 825 PetscCall(PetscFree(xdata)); 826 } 827 PetscCall(PetscFree(isz1)); 828 #if defined(PETSC_USE_CTABLE) 829 for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i)); 830 PetscCall(PetscFree(table_data)); 831 PetscCall(PetscFree(tdata)); 832 PetscCall(PetscFree4(table, data, isz, t_p)); 833 #else 834 PetscCall(PetscFree5(table, data, isz, d_p, t_p)); 835 #endif 836 PetscFunctionReturn(0); 837 } 838 839 /* 840 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 841 the work on the local processor. 
842 843 Inputs: 844 C - MAT_MPIAIJ; 845 imax - total no of index sets processed at a time; 846 table - an array of char - size = m bits. 847 848 Output: 849 isz - array containing the count of the solution elements corresponding 850 to each index set; 851 data or table_data - pointer to the solutions 852 */ 853 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data) 854 { 855 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 856 Mat A = c->A, B = c->B; 857 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data; 858 PetscInt start, end, val, max, rstart, cstart, *ai, *aj; 859 PetscInt *bi, *bj, *garray, i, j, k, row, isz_i; 860 PetscBT table_i; 861 #if defined(PETSC_USE_CTABLE) 862 PetscHMapI table_data_i; 863 PetscHashIter tpos; 864 PetscInt tcount, *tdata; 865 #else 866 PetscInt *data_i; 867 #endif 868 869 PetscFunctionBegin; 870 rstart = C->rmap->rstart; 871 cstart = C->cmap->rstart; 872 ai = a->i; 873 aj = a->j; 874 bi = b->i; 875 bj = b->j; 876 garray = c->garray; 877 878 for (i = 0; i < imax; i++) { 879 #if defined(PETSC_USE_CTABLE) 880 /* copy existing entries of table_data_i into tdata[] */ 881 table_data_i = table_data[i]; 882 PetscCall(PetscHMapIGetSize(table_data_i, &tcount)); 883 PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]); 884 885 PetscCall(PetscMalloc1(tcount, &tdata)); 886 PetscHashIterBegin(table_data_i, tpos); 887 while (!PetscHashIterAtEnd(table_data_i, tpos)) { 888 PetscHashIterGetKey(table_data_i, tpos, row); 889 PetscHashIterGetVal(table_data_i, tpos, j); 890 PetscHashIterNext(table_data_i, tpos); 891 tdata[--j] = --row; 892 PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount); 893 } 894 #else 895 data_i = data[i]; 896 #endif 897 table_i = table[i]; 898 isz_i = isz[i]; 899 max = 
isz[i]; 900 901 for (j = 0; j < max; j++) { 902 #if defined(PETSC_USE_CTABLE) 903 row = tdata[j] - rstart; 904 #else 905 row = data_i[j] - rstart; 906 #endif 907 start = ai[row]; 908 end = ai[row + 1]; 909 for (k = start; k < end; k++) { /* Amat */ 910 val = aj[k] + cstart; 911 if (!PetscBTLookupSet(table_i, val)) { 912 #if defined(PETSC_USE_CTABLE) 913 PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1)); 914 #else 915 data_i[isz_i] = val; 916 #endif 917 isz_i++; 918 } 919 } 920 start = bi[row]; 921 end = bi[row + 1]; 922 for (k = start; k < end; k++) { /* Bmat */ 923 val = garray[bj[k]]; 924 if (!PetscBTLookupSet(table_i, val)) { 925 #if defined(PETSC_USE_CTABLE) 926 PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1)); 927 #else 928 data_i[isz_i] = val; 929 #endif 930 isz_i++; 931 } 932 } 933 } 934 isz[i] = isz_i; 935 936 #if defined(PETSC_USE_CTABLE) 937 PetscCall(PetscFree(tdata)); 938 #endif 939 } 940 PetscFunctionReturn(0); 941 } 942 943 /* 944 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 945 and return the output 946 947 Input: 948 C - the matrix 949 nrqr - no of messages being processed. 950 rbuf - an array of pointers to the received requests 951 952 Output: 953 xdata - array of messages to be sent back 954 isz1 - size of each message 955 956 For better efficiency perhaps we should malloc separately each xdata[i], 957 then if a remalloc is required we need only copy the data for that one row 958 rather then all previous rows as it is now where a single large chunk of 959 memory is used. 
960 961 */ 962 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1) 963 { 964 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 965 Mat A = c->A, B = c->B; 966 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data; 967 PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k; 968 PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end; 969 PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr; 970 PetscInt *rbuf_i, kmax, rbuf_0; 971 PetscBT xtable; 972 973 PetscFunctionBegin; 974 m = C->rmap->N; 975 rstart = C->rmap->rstart; 976 cstart = C->cmap->rstart; 977 ai = a->i; 978 aj = a->j; 979 bi = b->i; 980 bj = b->j; 981 garray = c->garray; 982 983 for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) { 984 rbuf_i = rbuf[i]; 985 rbuf_0 = rbuf_i[0]; 986 ct += rbuf_0; 987 for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j]; 988 } 989 990 if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n; 991 else max1 = 1; 992 mem_estimate = 3 * ((total_sz > max1 ? 
total_sz : max1) + 1); 993 if (nrqr) { 994 PetscCall(PetscMalloc1(mem_estimate, &xdata[0])); 995 ++no_malloc; 996 } 997 PetscCall(PetscBTCreate(m, &xtable)); 998 PetscCall(PetscArrayzero(isz1, nrqr)); 999 1000 ct3 = 0; 1001 for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */ 1002 rbuf_i = rbuf[i]; 1003 rbuf_0 = rbuf_i[0]; 1004 ct1 = 2 * rbuf_0 + 1; 1005 ct2 = ct1; 1006 ct3 += ct1; 1007 for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/ 1008 PetscCall(PetscBTMemzero(m, xtable)); 1009 oct2 = ct2; 1010 kmax = rbuf_i[2 * j]; 1011 for (k = 0; k < kmax; k++, ct1++) { 1012 row = rbuf_i[ct1]; 1013 if (!PetscBTLookupSet(xtable, row)) { 1014 if (!(ct3 < mem_estimate)) { 1015 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 1016 PetscCall(PetscMalloc1(new_estimate, &tmp)); 1017 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 1018 PetscCall(PetscFree(xdata[0])); 1019 xdata[0] = tmp; 1020 mem_estimate = new_estimate; 1021 ++no_malloc; 1022 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 1023 } 1024 xdata[i][ct2++] = row; 1025 ct3++; 1026 } 1027 } 1028 for (k = oct2, max2 = ct2; k < max2; k++) { 1029 row = xdata[i][k] - rstart; 1030 start = ai[row]; 1031 end = ai[row + 1]; 1032 for (l = start; l < end; l++) { 1033 val = aj[l] + cstart; 1034 if (!PetscBTLookupSet(xtable, val)) { 1035 if (!(ct3 < mem_estimate)) { 1036 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 1037 PetscCall(PetscMalloc1(new_estimate, &tmp)); 1038 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 1039 PetscCall(PetscFree(xdata[0])); 1040 xdata[0] = tmp; 1041 mem_estimate = new_estimate; 1042 ++no_malloc; 1043 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 1044 } 1045 xdata[i][ct2++] = val; 1046 ct3++; 1047 } 1048 } 1049 start = bi[row]; 1050 end = bi[row + 1]; 1051 for (l = start; l < end; l++) { 1052 val = garray[bj[l]]; 1053 if (!PetscBTLookupSet(xtable, val)) { 1054 if (!(ct3 < mem_estimate)) { 1055 new_estimate = 
(PetscInt)(1.5 * mem_estimate) + 1; 1056 PetscCall(PetscMalloc1(new_estimate, &tmp)); 1057 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 1058 PetscCall(PetscFree(xdata[0])); 1059 xdata[0] = tmp; 1060 mem_estimate = new_estimate; 1061 ++no_malloc; 1062 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 1063 } 1064 xdata[i][ct2++] = val; 1065 ct3++; 1066 } 1067 } 1068 } 1069 /* Update the header*/ 1070 xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1071 xdata[i][2 * j - 1] = rbuf_i[2 * j - 1]; 1072 } 1073 xdata[i][0] = rbuf_0; 1074 if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; 1075 isz1[i] = ct2; /* size of each message */ 1076 } 1077 PetscCall(PetscBTDestroy(&xtable)); 1078 PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc)); 1079 PetscFunctionReturn(0); 1080 } 1081 /* -------------------------------------------------------------------------*/ 1082 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *); 1083 /* 1084 Every processor gets the entire matrix 1085 */ 1086 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[]) 1087 { 1088 Mat B; 1089 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 1090 Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data; 1091 PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL; 1092 PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j; 1093 PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf; 1094 1095 PetscFunctionBegin; 1096 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1097 PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank)); 1098 if (scall == MAT_INITIAL_MATRIX) { 1099 /* ---------------------------------------------------------------- 1100 Tell 
every processor the number of nonzeros per row 1101 */ 1102 PetscCall(PetscMalloc1(A->rmap->N, &lens)); 1103 for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart]; 1104 PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); 1105 for (i = 0; i < size; i++) { 1106 recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i]; 1107 displs[i] = A->rmap->range[i]; 1108 } 1109 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A))); 1110 /* --------------------------------------------------------------- 1111 Create the sequential matrix of the same type as the local block diagonal 1112 */ 1113 PetscCall(MatCreate(PETSC_COMM_SELF, &B)); 1114 PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE)); 1115 PetscCall(MatSetBlockSizesFromMats(B, A, A)); 1116 PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name)); 1117 PetscCall(MatSeqAIJSetPreallocation(B, 0, lens)); 1118 PetscCall(PetscCalloc1(2, Bin)); 1119 **Bin = B; 1120 b = (Mat_SeqAIJ *)B->data; 1121 1122 /*-------------------------------------------------------------------- 1123 Copy my part of matrix column indices over 1124 */ 1125 sendcount = ad->nz + bd->nz; 1126 jsendbuf = b->j + b->i[rstarts[rank]]; 1127 a_jsendbuf = ad->j; 1128 b_jsendbuf = bd->j; 1129 n = A->rmap->rend - A->rmap->rstart; 1130 cnt = 0; 1131 for (i = 0; i < n; i++) { 1132 /* put in lower diagonal portion */ 1133 m = bd->i[i + 1] - bd->i[i]; 1134 while (m > 0) { 1135 /* is it above diagonal (in bd (compressed) numbering) */ 1136 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1137 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1138 m--; 1139 } 1140 1141 /* put in diagonal portion */ 1142 for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1143 1144 /* put in upper diagonal 
portion */ 1145 while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1146 } 1147 PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt); 1148 1149 /*-------------------------------------------------------------------- 1150 Gather all column indices to all processors 1151 */ 1152 for (i = 0; i < size; i++) { 1153 recvcounts[i] = 0; 1154 for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j]; 1155 } 1156 displs[0] = 0; 1157 for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1]; 1158 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A))); 1159 /*-------------------------------------------------------------------- 1160 Assemble the matrix into useable form (note numerical values not yet set) 1161 */ 1162 /* set the b->ilen (length of each row) values */ 1163 PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N)); 1164 /* set the b->i indices */ 1165 b->i[0] = 0; 1166 for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1]; 1167 PetscCall(PetscFree(lens)); 1168 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1169 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1170 1171 } else { 1172 B = **Bin; 1173 b = (Mat_SeqAIJ *)B->data; 1174 } 1175 1176 /*-------------------------------------------------------------------- 1177 Copy my part of matrix numerical values into the values location 1178 */ 1179 if (flag == MAT_GET_VALUES) { 1180 const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf; 1181 MatScalar *sendbuf, *recvbuf; 1182 1183 PetscCall(MatSeqAIJGetArrayRead(a->A, &ada)); 1184 PetscCall(MatSeqAIJGetArrayRead(a->B, &bda)); 1185 sendcount = ad->nz + bd->nz; 1186 sendbuf = b->a + b->i[rstarts[rank]]; 1187 a_sendbuf = ada; 1188 b_sendbuf = bda; 1189 b_sendj = bd->j; 1190 n = A->rmap->rend - A->rmap->rstart; 1191 cnt = 0; 1192 for 
(i = 0; i < n; i++) { 1193 /* put in lower diagonal portion */ 1194 m = bd->i[i + 1] - bd->i[i]; 1195 while (m > 0) { 1196 /* is it above diagonal (in bd (compressed) numbering) */ 1197 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1198 sendbuf[cnt++] = *b_sendbuf++; 1199 m--; 1200 b_sendj++; 1201 } 1202 1203 /* put in diagonal portion */ 1204 for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++; 1205 1206 /* put in upper diagonal portion */ 1207 while (m-- > 0) { 1208 sendbuf[cnt++] = *b_sendbuf++; 1209 b_sendj++; 1210 } 1211 } 1212 PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt); 1213 1214 /* ----------------------------------------------------------------- 1215 Gather all numerical values to all processors 1216 */ 1217 if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); 1218 for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]]; 1219 displs[0] = 0; 1220 for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1]; 1221 recvbuf = b->a; 1222 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A))); 1223 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada)); 1224 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda)); 1225 } /* endof (flag == MAT_GET_VALUES) */ 1226 PetscCall(PetscFree2(recvcounts, displs)); 1227 1228 PetscCall(MatPropagateSymmetryOptions(A, B)); 1229 PetscFunctionReturn(0); 1230 } 1231 1232 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats) 1233 { 1234 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 1235 Mat submat, A = c->A, B = c->B; 1236 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc; 1237 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, 
nzA, nzB; 1238 PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray; 1239 const PetscInt *icol, *irow; 1240 PetscInt nrow, ncol, start; 1241 PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr; 1242 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc; 1243 PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr; 1244 PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz; 1245 PetscInt *lens, rmax, ncols, *cols, Crow; 1246 #if defined(PETSC_USE_CTABLE) 1247 PetscHMapI cmap, rmap; 1248 PetscInt *cmap_loc, *rmap_loc; 1249 #else 1250 PetscInt *cmap, *rmap; 1251 #endif 1252 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i; 1253 PetscInt *cworkB, lwrite, *subcols, *row2proc; 1254 PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL; 1255 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 1256 MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4; 1257 MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1258 MPI_Status *r_status3 = NULL, *r_status4, *s_status4; 1259 MPI_Comm comm; 1260 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1261 PetscMPIInt *onodes1, *olengths1, idex, end; 1262 Mat_SubSppt *smatis1; 1263 PetscBool isrowsorted, iscolsorted; 1264 1265 PetscFunctionBegin; 1266 PetscValidLogicalCollectiveInt(C, ismax, 2); 1267 PetscValidLogicalCollectiveEnum(C, scall, 5); 1268 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1269 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1270 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1271 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1272 size = c->size; 1273 rank = c->rank; 1274 1275 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1276 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1277 
PetscCall(ISGetIndices(isrow[0], &irow)); 1278 PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1279 if (allcolumns) { 1280 icol = NULL; 1281 ncol = C->cmap->N; 1282 } else { 1283 PetscCall(ISGetIndices(iscol[0], &icol)); 1284 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1285 } 1286 1287 if (scall == MAT_INITIAL_MATRIX) { 1288 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1289 1290 /* Get some new tags to keep the communication clean */ 1291 tag1 = ((PetscObject)C)->tag; 1292 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1293 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1294 1295 /* evaluate communication - mesg to who, length of mesg, and buffer space 1296 required. Based on this, buffers are allocated, and data copied into them */ 1297 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1298 PetscCall(PetscMalloc1(nrow, &row2proc)); 1299 1300 /* w1[proc] = num of rows owned by proc -- to be requested */ 1301 proc = 0; 1302 nrqs = 0; /* num of outgoing messages */ 1303 for (j = 0; j < nrow; j++) { 1304 row = irow[j]; 1305 if (!isrowsorted) proc = 0; 1306 while (row >= C->rmap->range[proc + 1]) proc++; 1307 w1[proc]++; 1308 row2proc[j] = proc; /* map row index to proc */ 1309 1310 if (proc != rank && !w2[proc]) { 1311 w2[proc] = 1; 1312 nrqs++; 1313 } 1314 } 1315 w1[rank] = 0; /* rows owned by self will not be requested */ 1316 1317 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1318 for (proc = 0, j = 0; proc < size; proc++) { 1319 if (w1[proc]) pa[j++] = proc; 1320 } 1321 1322 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1323 msz = 0; /* total mesg length (for all procs) */ 1324 for (i = 0; i < nrqs; i++) { 1325 proc = pa[i]; 1326 w1[proc] += 3; 1327 msz += w1[proc]; 1328 } 1329 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1330 1331 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1332 /* if 
w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1333 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1334 1335 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1336 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1337 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1338 1339 /* Now post the Irecvs corresponding to these messages */ 1340 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1341 1342 PetscCall(PetscFree(onodes1)); 1343 PetscCall(PetscFree(olengths1)); 1344 1345 /* Allocate Memory for outgoing messages */ 1346 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1347 PetscCall(PetscArrayzero(sbuf1, size)); 1348 PetscCall(PetscArrayzero(ptr, size)); 1349 1350 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1351 iptr = tmp; 1352 for (i = 0; i < nrqs; i++) { 1353 proc = pa[i]; 1354 sbuf1[proc] = iptr; 1355 iptr += w1[proc]; 1356 } 1357 1358 /* Form the outgoing messages */ 1359 /* Initialize the header space */ 1360 for (i = 0; i < nrqs; i++) { 1361 proc = pa[i]; 1362 PetscCall(PetscArrayzero(sbuf1[proc], 3)); 1363 ptr[proc] = sbuf1[proc] + 3; 1364 } 1365 1366 /* Parse the isrow and copy data into outbuf */ 1367 PetscCall(PetscArrayzero(ctr, size)); 1368 for (j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1369 proc = row2proc[j]; 1370 if (proc != rank) { /* copy to the outgoing buf*/ 1371 *ptr[proc] = irow[j]; 1372 ctr[proc]++; 1373 ptr[proc]++; 1374 } 1375 } 1376 1377 /* Update the headers for the current IS */ 1378 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 1379 if ((ctr_j = ctr[j])) { 1380 sbuf1_j = sbuf1[j]; 1381 k = ++sbuf1_j[0]; 1382 sbuf1_j[2 * k] = ctr_j; 1383 sbuf1_j[2 * k - 1] = 0; 1384 } 1385 } 1386 1387 /* Now post the sends */ 1388 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1389 for (i = 0; i < nrqs; ++i) { 1390 proc = 
pa[i]; 1391 PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i)); 1392 } 1393 1394 /* Post Receives to capture the buffer size */ 1395 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1396 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1397 1398 if (nrqs) rbuf2[0] = tmp + msz; 1399 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1400 1401 for (i = 0; i < nrqs; ++i) { 1402 proc = pa[i]; 1403 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i)); 1404 } 1405 1406 PetscCall(PetscFree2(w1, w2)); 1407 1408 /* Send to other procs the buf size they should allocate */ 1409 /* Receive messages*/ 1410 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1411 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1412 1413 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1414 for (i = 0; i < nrqr; ++i) { 1415 req_size[i] = 0; 1416 rbuf1_i = rbuf1[i]; 1417 start = 2 * rbuf1_i[0] + 1; 1418 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1419 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1420 sbuf2_i = sbuf2[i]; 1421 for (j = start; j < end; j++) { 1422 k = rbuf1_i[j] - rstart; 1423 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1424 sbuf2_i[j] = ncols; 1425 req_size[i] += ncols; 1426 } 1427 req_source1[i] = r_status1[i].MPI_SOURCE; 1428 1429 /* form the header */ 1430 sbuf2_i[0] = req_size[i]; 1431 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1432 1433 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1434 } 1435 1436 PetscCall(PetscFree(r_status1)); 1437 PetscCall(PetscFree(r_waits1)); 1438 1439 /* rbuf2 is received, Post recv column indices a->j */ 1440 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1441 1442 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1443 for (i = 0; i < nrqs; ++i) { 1444 
PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 1445 req_source2[i] = r_status2[i].MPI_SOURCE; 1446 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1447 } 1448 1449 /* Wait on sends1 and sends2 */ 1450 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1451 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1452 PetscCall(PetscFree(s_waits1)); 1453 PetscCall(PetscFree(s_status1)); 1454 1455 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1456 PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2)); 1457 1458 /* Now allocate sending buffers for a->j, and send them off */ 1459 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 1460 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1461 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 1462 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1463 1464 for (i = 0; i < nrqr; i++) { /* for each requested message */ 1465 rbuf1_i = rbuf1[i]; 1466 sbuf_aj_i = sbuf_aj[i]; 1467 ct1 = 2 * rbuf1_i[0] + 1; 1468 ct2 = 0; 1469 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1470 1471 kmax = rbuf1[i][2]; 1472 for (k = 0; k < kmax; k++, ct1++) { /* for each row */ 1473 row = rbuf1_i[ct1] - rstart; 1474 nzA = ai[row + 1] - ai[row]; 1475 nzB = bi[row + 1] - bi[row]; 1476 ncols = nzA + nzB; 1477 cworkA = aj + ai[row]; 1478 cworkB = bj + bi[row]; 1479 1480 /* load the column indices for this row into cols*/ 1481 cols = sbuf_aj_i + ct2; 1482 1483 lwrite = 0; 1484 for (l = 0; l < nzB; l++) { 1485 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1486 } 1487 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1488 for (l = 0; l < nzB; l++) { 1489 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1490 } 1491 1492 ct2 += ncols; 1493 } 1494 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1495 } 1496 1497 /* create column map (cmap): 
global col of C -> local col of submat */ 1498 #if defined(PETSC_USE_CTABLE) 1499 if (!allcolumns) { 1500 PetscCall(PetscHMapICreateWithSize(ncol, &cmap)); 1501 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1502 for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1503 if (icol[j] >= cstart && icol[j] < cend) { 1504 cmap_loc[icol[j] - cstart] = j + 1; 1505 } else { /* use PetscHMapI for non-local col indices */ 1506 PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1)); 1507 } 1508 } 1509 } else { 1510 cmap = NULL; 1511 cmap_loc = NULL; 1512 } 1513 PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1514 #else 1515 if (!allcolumns) { 1516 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1517 for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1518 } else { 1519 cmap = NULL; 1520 } 1521 #endif 1522 1523 /* Create lens for MatSeqAIJSetPreallocation() */ 1524 PetscCall(PetscCalloc1(nrow, &lens)); 1525 1526 /* Compute lens from local part of C */ 1527 for (j = 0; j < nrow; j++) { 1528 row = irow[j]; 1529 proc = row2proc[j]; 1530 if (proc == rank) { 1531 /* diagonal part A = c->A */ 1532 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1533 cols = aj + ai[row - rstart]; 1534 if (!allcolumns) { 1535 for (k = 0; k < ncols; k++) { 1536 #if defined(PETSC_USE_CTABLE) 1537 tcol = cmap_loc[cols[k]]; 1538 #else 1539 tcol = cmap[cols[k] + cstart]; 1540 #endif 1541 if (tcol) lens[j]++; 1542 } 1543 } else { /* allcolumns */ 1544 lens[j] = ncols; 1545 } 1546 1547 /* off-diagonal part B = c->B */ 1548 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1549 cols = bj + bi[row - rstart]; 1550 if (!allcolumns) { 1551 for (k = 0; k < ncols; k++) { 1552 #if defined(PETSC_USE_CTABLE) 1553 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1554 #else 1555 tcol = cmap[bmap[cols[k]]]; 1556 #endif 1557 if (tcol) lens[j]++; 1558 } 1559 } else { /* allcolumns */ 1560 lens[j] += ncols; 1561 } 1562 } 1563 } 1564 1565 /* Create row map (rmap): global row of C -> 
local row of submat */ 1566 #if defined(PETSC_USE_CTABLE) 1567 PetscCall(PetscHMapICreateWithSize(nrow, &rmap)); 1568 for (j = 0; j < nrow; j++) { 1569 row = irow[j]; 1570 proc = row2proc[j]; 1571 if (proc == rank) { /* a local row */ 1572 rmap_loc[row - rstart] = j; 1573 } else { 1574 PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1)); 1575 } 1576 } 1577 #else 1578 PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1579 for (j = 0; j < nrow; j++) rmap[irow[j]] = j; 1580 #endif 1581 1582 /* Update lens from offproc data */ 1583 /* recv a->j is done */ 1584 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1585 for (i = 0; i < nrqs; i++) { 1586 proc = pa[i]; 1587 sbuf1_i = sbuf1[proc]; 1588 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1589 ct1 = 2 + 1; 1590 ct2 = 0; 1591 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1592 rbuf3_i = rbuf3[i]; /* received C->j */ 1593 1594 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1595 max1 = sbuf1_i[2]; 1596 for (k = 0; k < max1; k++, ct1++) { 1597 #if defined(PETSC_USE_CTABLE) 1598 PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row)); 1599 row--; 1600 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1601 #else 1602 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1603 #endif 1604 /* Now, store row index of submat in sbuf1_i[ct1] */ 1605 sbuf1_i[ct1] = row; 1606 1607 nnz = rbuf2_i[ct1]; 1608 if (!allcolumns) { 1609 for (l = 0; l < nnz; l++, ct2++) { 1610 #if defined(PETSC_USE_CTABLE) 1611 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1612 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1613 } else { 1614 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1615 } 1616 #else 1617 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1618 #endif 1619 if (tcol) lens[row]++; 1620 } 1621 } else { /* allcolumns */ 1622 lens[row] += nnz; 1623 } 1624 } 1625 } 1626 
PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1627 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1628 1629 /* Create the submatrices */ 1630 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1631 PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1632 1633 PetscCall(ISGetBlockSize(isrow[0], &i)); 1634 PetscCall(ISGetBlockSize(iscol[0], &j)); 1635 PetscCall(MatSetBlockSizes(submat, i, j)); 1636 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1637 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1638 1639 /* create struct Mat_SubSppt and attached it to submat */ 1640 PetscCall(PetscNew(&smatis1)); 1641 subc = (Mat_SeqAIJ *)submat->data; 1642 subc->submatis1 = smatis1; 1643 1644 smatis1->id = 0; 1645 smatis1->nrqs = nrqs; 1646 smatis1->nrqr = nrqr; 1647 smatis1->rbuf1 = rbuf1; 1648 smatis1->rbuf2 = rbuf2; 1649 smatis1->rbuf3 = rbuf3; 1650 smatis1->sbuf2 = sbuf2; 1651 smatis1->req_source2 = req_source2; 1652 1653 smatis1->sbuf1 = sbuf1; 1654 smatis1->ptr = ptr; 1655 smatis1->tmp = tmp; 1656 smatis1->ctr = ctr; 1657 1658 smatis1->pa = pa; 1659 smatis1->req_size = req_size; 1660 smatis1->req_source1 = req_source1; 1661 1662 smatis1->allcolumns = allcolumns; 1663 smatis1->singleis = PETSC_TRUE; 1664 smatis1->row2proc = row2proc; 1665 smatis1->rmap = rmap; 1666 smatis1->cmap = cmap; 1667 #if defined(PETSC_USE_CTABLE) 1668 smatis1->rmap_loc = rmap_loc; 1669 smatis1->cmap_loc = cmap_loc; 1670 #endif 1671 1672 smatis1->destroy = submat->ops->destroy; 1673 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1674 submat->factortype = C->factortype; 1675 1676 /* compute rmax */ 1677 rmax = 0; 1678 for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1679 1680 } else { /* scall == MAT_REUSE_MATRIX */ 1681 submat = submats[0]; 1682 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1683 1684 subc = (Mat_SeqAIJ *)submat->data; 1685 rmax = subc->rmax; 1686 smatis1 = subc->submatis1; 1687 nrqs = smatis1->nrqs; 1688 nrqr = smatis1->nrqr; 1689 rbuf1 = smatis1->rbuf1; 1690 rbuf2 = smatis1->rbuf2; 1691 rbuf3 = smatis1->rbuf3; 1692 req_source2 = smatis1->req_source2; 1693 1694 sbuf1 = smatis1->sbuf1; 1695 sbuf2 = smatis1->sbuf2; 1696 ptr = smatis1->ptr; 1697 tmp = smatis1->tmp; 1698 ctr = smatis1->ctr; 1699 1700 pa = smatis1->pa; 1701 req_size = smatis1->req_size; 1702 req_source1 = smatis1->req_source1; 1703 1704 allcolumns = smatis1->allcolumns; 1705 row2proc = smatis1->row2proc; 1706 rmap = smatis1->rmap; 1707 cmap = smatis1->cmap; 1708 #if defined(PETSC_USE_CTABLE) 1709 rmap_loc = smatis1->rmap_loc; 1710 cmap_loc = smatis1->cmap_loc; 1711 #endif 1712 } 1713 1714 /* Post recv matrix values */ 1715 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1716 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1717 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1718 for (i = 0; i < nrqs; ++i) { 1719 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1720 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1721 } 1722 1723 /* Allocate sending buffers for a->a, and send them off */ 1724 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1725 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1726 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 1727 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1728 1729 for (i = 0; i < nrqr; i++) { 1730 rbuf1_i = rbuf1[i]; 1731 sbuf_aa_i = sbuf_aa[i]; 1732 ct1 = 2 * rbuf1_i[0] + 1; 1733 ct2 = 0; 1734 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1735 1736 kmax = rbuf1_i[2]; 1737 for (k = 0; k < kmax; k++, ct1++) { 1738 row = rbuf1_i[ct1] - rstart; 1739 nzA = ai[row + 1] - ai[row]; 1740 nzB = bi[row + 1] - bi[row]; 1741 ncols = 
nzA + nzB; 1742 cworkB = bj + bi[row]; 1743 vworkA = a_a + ai[row]; 1744 vworkB = b_a + bi[row]; 1745 1746 /* load the column values for this row into vals*/ 1747 vals = sbuf_aa_i + ct2; 1748 1749 lwrite = 0; 1750 for (l = 0; l < nzB; l++) { 1751 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1752 } 1753 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1754 for (l = 0; l < nzB; l++) { 1755 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1756 } 1757 1758 ct2 += ncols; 1759 } 1760 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1761 } 1762 1763 /* Assemble submat */ 1764 /* First assemble the local rows */ 1765 for (j = 0; j < nrow; j++) { 1766 row = irow[j]; 1767 proc = row2proc[j]; 1768 if (proc == rank) { 1769 Crow = row - rstart; /* local row index of C */ 1770 #if defined(PETSC_USE_CTABLE) 1771 row = rmap_loc[Crow]; /* row index of submat */ 1772 #else 1773 row = rmap[row]; 1774 #endif 1775 1776 if (allcolumns) { 1777 /* diagonal part A = c->A */ 1778 ncols = ai[Crow + 1] - ai[Crow]; 1779 cols = aj + ai[Crow]; 1780 vals = a_a + ai[Crow]; 1781 i = 0; 1782 for (k = 0; k < ncols; k++) { 1783 subcols[i] = cols[k] + cstart; 1784 subvals[i++] = vals[k]; 1785 } 1786 1787 /* off-diagonal part B = c->B */ 1788 ncols = bi[Crow + 1] - bi[Crow]; 1789 cols = bj + bi[Crow]; 1790 vals = b_a + bi[Crow]; 1791 for (k = 0; k < ncols; k++) { 1792 subcols[i] = bmap[cols[k]]; 1793 subvals[i++] = vals[k]; 1794 } 1795 1796 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1797 1798 } else { /* !allcolumns */ 1799 #if defined(PETSC_USE_CTABLE) 1800 /* diagonal part A = c->A */ 1801 ncols = ai[Crow + 1] - ai[Crow]; 1802 cols = aj + ai[Crow]; 1803 vals = a_a + ai[Crow]; 1804 i = 0; 1805 for (k = 0; k < ncols; k++) { 1806 tcol = cmap_loc[cols[k]]; 1807 if (tcol) { 1808 subcols[i] = --tcol; 1809 subvals[i++] = vals[k]; 1810 } 1811 } 1812 1813 /* off-diagonal part B = c->B 
*/ 1814 ncols = bi[Crow + 1] - bi[Crow]; 1815 cols = bj + bi[Crow]; 1816 vals = b_a + bi[Crow]; 1817 for (k = 0; k < ncols; k++) { 1818 PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol)); 1819 if (tcol) { 1820 subcols[i] = --tcol; 1821 subvals[i++] = vals[k]; 1822 } 1823 } 1824 #else 1825 /* diagonal part A = c->A */ 1826 ncols = ai[Crow + 1] - ai[Crow]; 1827 cols = aj + ai[Crow]; 1828 vals = a_a + ai[Crow]; 1829 i = 0; 1830 for (k = 0; k < ncols; k++) { 1831 tcol = cmap[cols[k] + cstart]; 1832 if (tcol) { 1833 subcols[i] = --tcol; 1834 subvals[i++] = vals[k]; 1835 } 1836 } 1837 1838 /* off-diagonal part B = c->B */ 1839 ncols = bi[Crow + 1] - bi[Crow]; 1840 cols = bj + bi[Crow]; 1841 vals = b_a + bi[Crow]; 1842 for (k = 0; k < ncols; k++) { 1843 tcol = cmap[bmap[cols[k]]]; 1844 if (tcol) { 1845 subcols[i] = --tcol; 1846 subvals[i++] = vals[k]; 1847 } 1848 } 1849 #endif 1850 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1851 } 1852 } 1853 } 1854 1855 /* Now assemble the off-proc rows */ 1856 for (i = 0; i < nrqs; i++) { /* for each requested message */ 1857 /* recv values from other processes */ 1858 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1859 proc = pa[idex]; 1860 sbuf1_i = sbuf1[proc]; 1861 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1862 ct1 = 2 + 1; 1863 ct2 = 0; /* count of received C->j */ 1864 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1865 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1866 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1867 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1868 1869 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1870 max1 = sbuf1_i[2]; /* num of rows */ 1871 for (k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1872 row = 
sbuf1_i[ct1]; /* row index of submat */ 1873 if (!allcolumns) { 1874 idex = 0; 1875 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1876 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1877 for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1878 #if defined(PETSC_USE_CTABLE) 1879 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1880 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1881 } else { 1882 PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol)); 1883 } 1884 #else 1885 tcol = cmap[rbuf3_i[ct2]]; 1886 #endif 1887 if (tcol) { 1888 subcols[idex] = --tcol; /* may not be sorted */ 1889 subvals[idex++] = rbuf4_i[ct2]; 1890 1891 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1892 For reuse, we replace received C->j with index that should be inserted to submat */ 1893 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1894 } 1895 } 1896 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1897 } else { /* scall == MAT_REUSE_MATRIX */ 1898 submat = submats[0]; 1899 subc = (Mat_SeqAIJ *)submat->data; 1900 1901 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1902 for (l = 0; l < nnz; l++) { 1903 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1904 subvals[idex++] = rbuf4_i[ct2]; 1905 } 1906 1907 bj = subc->j + subc->i[row]; /* sorted column indices */ 1908 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1909 } 1910 } else { /* allcolumns */ 1911 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1912 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES)); 1913 ct2 += nnz; 1914 } 1915 } 1916 } 1917 1918 /* sending a->a are done */ 1919 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1920 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1921 1922 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 
  PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
  submats[0] = submat;

  /* Restore the row indices; column indices were only obtained when a proper (non-all-columns) column IS was given */
  PetscCall(ISRestoreIndices(isrow[0], &irow));
  if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));

  /* Destroy allocated memory */
  for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
  PetscCall(PetscFree3(rbuf4, subcols, subvals));
  if (sbuf_aa) {
    /* the per-destination value buffers share one allocation anchored at sbuf_aa[0] */
    PetscCall(PetscFree(sbuf_aa[0]));
    PetscCall(PetscFree(sbuf_aa));
  }

  if (scall == MAT_INITIAL_MATRIX) {
    /* lens and the column-index send buffers are kept inside submat for MAT_REUSE_MATRIX, so they are only freed on first use */
    PetscCall(PetscFree(lens));
    if (sbuf_aj) {
      /* same single-allocation layout as sbuf_aa above */
      PetscCall(PetscFree(sbuf_aj[0]));
      PetscCall(PetscFree(sbuf_aj));
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS - extracts sequential submatrices when each process
  supplies a single pair of index sets (the fast path used when C->submat_singleis is set).

  Detects the special case where the column IS is the identity covering all global columns
  (allcolumns), which lets the worker routine skip building a global-to-local column map,
  then delegates all communication and assembly to MatCreateSubMatrices_MPIAIJ_SingleIS_Local().

  NOTE(review): two slots are allocated for *submat although only submat[0] is a real matrix;
  presumably the extra slot mirrors the "dummy submatrix" scheme used for reuse bookkeeping
  elsewhere in this file - confirm against MatCreateSubMatrices_MPIAIJ_SingleIS_Local().
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt  ncol;
  PetscBool colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol[0], &colflag));
  PetscCall(ISGetLocalSize(iscol[0], &ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ - extracts ismax sequential submatrices of the parallel AIJ
  matrix C, one per (isrow[i], iscol[i]) pair; the work is split into stages so that the
  aggregate buffer space per stage stays bounded (body continues below).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;
  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* nmax = number of submatrices extracted per stage, sized so that roughly 20M bytes of
     PetscInt workspace per stage suffice for the matrix's column dimension; at least 1 */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      /* the single IS pair selects the identity on the whole square matrix: use the fast all-matrix path */
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* single reduction carrying both decisions: in[0] = -wantallmatrix so that MPI_MAX yields 0
       (i.e. FALSE) if any process does not want the fast path; in[1] picks the largest stage count */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* stage count was saved on the first (MAT_INITIAL_MATRIX) call */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  /* Stage loop: every process executes the same global nstages; a process whose own ISs are
     exhausted still participates (max_no = 0) so the collective calls inside _Local match up */
  for (i = 0, pos = 0; i < nstages; i++) {
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
PetscFunctionReturn(0); 2063 } 2064 2065 /* -------------------------------------------------------------------------*/ 2066 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) 2067 { 2068 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 2069 Mat A = c->A; 2070 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc; 2071 const PetscInt **icol, **irow; 2072 PetscInt *nrow, *ncol, start; 2073 PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr; 2074 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1; 2075 PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol; 2076 PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2077 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2078 #if defined(PETSC_USE_CTABLE) 2079 PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i; 2080 #else 2081 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2082 #endif 2083 const PetscInt *irow_i; 2084 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2085 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2086 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2087 MPI_Comm comm; 2088 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2089 PetscMPIInt *onodes1, *olengths1, end; 2090 PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2091 Mat_SubSppt *smat_i; 2092 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2093 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen; 2094 2095 PetscFunctionBegin; 2096 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2097 size = c->size; 2098 rank = c->rank; 2099 2100 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2101 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt 
***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2102 2103 for (i = 0; i < ismax; i++) { 2104 PetscCall(ISSorted(iscol[i], &issorted[i])); 2105 if (!issorted[i]) iscsorted = issorted[i]; 2106 2107 PetscCall(ISSorted(isrow[i], &issorted[i])); 2108 2109 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2110 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2111 2112 /* Check for special case: allcolumn */ 2113 PetscCall(ISIdentity(iscol[i], &colflag)); 2114 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2115 if (colflag && ncol[i] == C->cmap->N) { 2116 allcolumns[i] = PETSC_TRUE; 2117 icol[i] = NULL; 2118 } else { 2119 allcolumns[i] = PETSC_FALSE; 2120 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2121 } 2122 } 2123 2124 if (scall == MAT_REUSE_MATRIX) { 2125 /* Assumes new rows are same length as the old rows */ 2126 for (i = 0; i < ismax; i++) { 2127 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2128 subc = (Mat_SeqAIJ *)submats[i]->data; 2129 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 2130 2131 /* Initial matrix as if empty */ 2132 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2133 2134 smat_i = subc->submatis1; 2135 2136 nrqs = smat_i->nrqs; 2137 nrqr = smat_i->nrqr; 2138 rbuf1 = smat_i->rbuf1; 2139 rbuf2 = smat_i->rbuf2; 2140 rbuf3 = smat_i->rbuf3; 2141 req_source2 = smat_i->req_source2; 2142 2143 sbuf1 = smat_i->sbuf1; 2144 sbuf2 = smat_i->sbuf2; 2145 ptr = smat_i->ptr; 2146 tmp = smat_i->tmp; 2147 ctr = smat_i->ctr; 2148 2149 pa = smat_i->pa; 2150 req_size = smat_i->req_size; 2151 req_source1 = smat_i->req_source1; 2152 2153 allcolumns[i] = smat_i->allcolumns; 2154 row2proc[i] = smat_i->row2proc; 2155 rmap[i] = smat_i->rmap; 2156 cmap[i] = smat_i->cmap; 2157 } 2158 2159 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2160 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2161 smat_i = (Mat_SubSppt *)submats[0]->data; 2162 2163 nrqs = smat_i->nrqs; 2164 nrqr = smat_i->nrqr; 2165 rbuf1 = smat_i->rbuf1; 2166 rbuf2 = smat_i->rbuf2; 2167 rbuf3 = smat_i->rbuf3; 2168 req_source2 = smat_i->req_source2; 2169 2170 sbuf1 = smat_i->sbuf1; 2171 sbuf2 = smat_i->sbuf2; 2172 ptr = smat_i->ptr; 2173 tmp = smat_i->tmp; 2174 ctr = smat_i->ctr; 2175 2176 pa = smat_i->pa; 2177 req_size = smat_i->req_size; 2178 req_source1 = smat_i->req_source1; 2179 2180 allcolumns[0] = PETSC_FALSE; 2181 } 2182 } else { /* scall == MAT_INITIAL_MATRIX */ 2183 /* Get some new tags to keep the communication clean */ 2184 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2185 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2186 2187 /* evaluate communication - mesg to who, length of mesg, and buffer space 2188 required. 
Based on this, buffers are allocated, and data copied into them*/ 2189 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2190 2191 for (i = 0; i < ismax; i++) { 2192 jmax = nrow[i]; 2193 irow_i = irow[i]; 2194 2195 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2196 row2proc[i] = row2proc_i; 2197 2198 if (issorted[i]) proc = 0; 2199 for (j = 0; j < jmax; j++) { 2200 if (!issorted[i]) proc = 0; 2201 row = irow_i[j]; 2202 while (row >= C->rmap->range[proc + 1]) proc++; 2203 w4[proc]++; 2204 row2proc_i[j] = proc; /* map row index to proc */ 2205 } 2206 for (j = 0; j < size; j++) { 2207 if (w4[j]) { 2208 w1[j] += w4[j]; 2209 w3[j]++; 2210 w4[j] = 0; 2211 } 2212 } 2213 } 2214 2215 nrqs = 0; /* no of outgoing messages */ 2216 msz = 0; /* total mesg length (for all procs) */ 2217 w1[rank] = 0; /* no mesg sent to self */ 2218 w3[rank] = 0; 2219 for (i = 0; i < size; i++) { 2220 if (w1[i]) { 2221 w2[i] = 1; 2222 nrqs++; 2223 } /* there exists a message to proc i */ 2224 } 2225 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2226 for (i = 0, j = 0; i < size; i++) { 2227 if (w1[i]) { 2228 pa[j] = i; 2229 j++; 2230 } 2231 } 2232 2233 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2234 for (i = 0; i < nrqs; i++) { 2235 j = pa[i]; 2236 w1[j] += w2[j] + 2 * w3[j]; 2237 msz += w1[j]; 2238 } 2239 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2240 2241 /* Determine the number of messages to expect, their lengths, from from-ids */ 2242 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 2243 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2244 2245 /* Now post the Irecvs corresponding to these messages */ 2246 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2247 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2248 2249 /* Allocate Memory 
for outgoing messages */ 2250 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2251 PetscCall(PetscArrayzero(sbuf1, size)); 2252 PetscCall(PetscArrayzero(ptr, size)); 2253 2254 { 2255 PetscInt *iptr = tmp; 2256 k = 0; 2257 for (i = 0; i < nrqs; i++) { 2258 j = pa[i]; 2259 iptr += k; 2260 sbuf1[j] = iptr; 2261 k = w1[j]; 2262 } 2263 } 2264 2265 /* Form the outgoing messages. Initialize the header space */ 2266 for (i = 0; i < nrqs; i++) { 2267 j = pa[i]; 2268 sbuf1[j][0] = 0; 2269 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2270 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2271 } 2272 2273 /* Parse the isrow and copy data into outbuf */ 2274 for (i = 0; i < ismax; i++) { 2275 row2proc_i = row2proc[i]; 2276 PetscCall(PetscArrayzero(ctr, size)); 2277 irow_i = irow[i]; 2278 jmax = nrow[i]; 2279 for (j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2280 proc = row2proc_i[j]; 2281 if (proc != rank) { /* copy to the outgoing buf*/ 2282 ctr[proc]++; 2283 *ptr[proc] = irow_i[j]; 2284 ptr[proc]++; 2285 } 2286 } 2287 /* Update the headers for the current IS */ 2288 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 2289 if ((ctr_j = ctr[j])) { 2290 sbuf1_j = sbuf1[j]; 2291 k = ++sbuf1_j[0]; 2292 sbuf1_j[2 * k] = ctr_j; 2293 sbuf1_j[2 * k - 1] = i; 2294 } 2295 } 2296 } 2297 2298 /* Now post the sends */ 2299 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2300 for (i = 0; i < nrqs; ++i) { 2301 j = pa[i]; 2302 PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2303 } 2304 2305 /* Post Receives to capture the buffer size */ 2306 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2307 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2308 if (nrqs) rbuf2[0] = tmp + msz; 2309 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2310 for (i = 0; i < nrqs; ++i) { 2311 j = pa[i]; 2312 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2313 } 2314 2315 /* 
Send to other procs the buf size they should allocate */ 2316 /* Receive messages*/ 2317 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2318 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2319 { 2320 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2321 PetscInt *sbuf2_i; 2322 2323 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2324 for (i = 0; i < nrqr; ++i) { 2325 req_size[i] = 0; 2326 rbuf1_i = rbuf1[i]; 2327 start = 2 * rbuf1_i[0] + 1; 2328 end = olengths1[i]; 2329 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2330 sbuf2_i = sbuf2[i]; 2331 for (j = start; j < end; j++) { 2332 id = rbuf1_i[j] - rstart; 2333 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2334 sbuf2_i[j] = ncols; 2335 req_size[i] += ncols; 2336 } 2337 req_source1[i] = onodes1[i]; 2338 /* form the header */ 2339 sbuf2_i[0] = req_size[i]; 2340 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2341 2342 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2343 } 2344 } 2345 2346 PetscCall(PetscFree(onodes1)); 2347 PetscCall(PetscFree(olengths1)); 2348 PetscCall(PetscFree(r_waits1)); 2349 PetscCall(PetscFree4(w1, w2, w3, w4)); 2350 2351 /* Receive messages*/ 2352 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2353 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2354 for (i = 0; i < nrqs; ++i) { 2355 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2356 req_source2[i] = pa[i]; 2357 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2358 } 2359 PetscCall(PetscFree(r_waits2)); 2360 2361 /* Wait on sends1 and sends2 */ 2362 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2363 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2364 PetscCall(PetscFree(s_waits1)); 2365 PetscCall(PetscFree(s_waits2)); 2366 2367 /* Now allocate sending buffers for a->j, and send them off */ 2368 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 
2369 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2370 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 2371 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2372 2373 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2374 { 2375 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2376 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2377 PetscInt cend = C->cmap->rend; 2378 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2379 2380 for (i = 0; i < nrqr; i++) { 2381 rbuf1_i = rbuf1[i]; 2382 sbuf_aj_i = sbuf_aj[i]; 2383 ct1 = 2 * rbuf1_i[0] + 1; 2384 ct2 = 0; 2385 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2386 kmax = rbuf1[i][2 * j]; 2387 for (k = 0; k < kmax; k++, ct1++) { 2388 row = rbuf1_i[ct1] - rstart; 2389 nzA = a_i[row + 1] - a_i[row]; 2390 nzB = b_i[row + 1] - b_i[row]; 2391 ncols = nzA + nzB; 2392 cworkA = a_j + a_i[row]; 2393 cworkB = b_j + b_i[row]; 2394 2395 /* load the column indices for this row into cols */ 2396 cols = sbuf_aj_i + ct2; 2397 2398 lwrite = 0; 2399 for (l = 0; l < nzB; l++) { 2400 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2401 } 2402 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2403 for (l = 0; l < nzB; l++) { 2404 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2405 } 2406 2407 ct2 += ncols; 2408 } 2409 } 2410 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2411 } 2412 } 2413 2414 /* create col map: global col of C -> local col of submatrices */ 2415 { 2416 const PetscInt *icol_i; 2417 #if defined(PETSC_USE_CTABLE) 2418 for (i = 0; i < ismax; i++) { 2419 if (!allcolumns[i]) { 2420 PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i)); 2421 2422 jmax = ncol[i]; 2423 icol_i = icol[i]; 2424 cmap_i = cmap[i]; 2425 for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1)); 2426 } else cmap[i] = NULL; 2427 } 2428 #else 2429 for (i = 0; i < 
ismax; i++) { 2430 if (!allcolumns[i]) { 2431 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2432 jmax = ncol[i]; 2433 icol_i = icol[i]; 2434 cmap_i = cmap[i]; 2435 for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2436 } else cmap[i] = NULL; 2437 } 2438 #endif 2439 } 2440 2441 /* Create lens which is required for MatCreate... */ 2442 for (i = 0, j = 0; i < ismax; i++) j += nrow[i]; 2443 PetscCall(PetscMalloc1(ismax, &lens)); 2444 2445 if (ismax) PetscCall(PetscCalloc1(j, &lens[0])); 2446 for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1]; 2447 2448 /* Update lens from local data */ 2449 for (i = 0; i < ismax; i++) { 2450 row2proc_i = row2proc[i]; 2451 jmax = nrow[i]; 2452 if (!allcolumns[i]) cmap_i = cmap[i]; 2453 irow_i = irow[i]; 2454 lens_i = lens[i]; 2455 for (j = 0; j < jmax; j++) { 2456 row = irow_i[j]; 2457 proc = row2proc_i[j]; 2458 if (proc == rank) { 2459 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2460 if (!allcolumns[i]) { 2461 for (k = 0; k < ncols; k++) { 2462 #if defined(PETSC_USE_CTABLE) 2463 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2464 #else 2465 tcol = cmap_i[cols[k]]; 2466 #endif 2467 if (tcol) lens_i[j]++; 2468 } 2469 } else { /* allcolumns */ 2470 lens_i[j] = ncols; 2471 } 2472 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2473 } 2474 } 2475 } 2476 2477 /* Create row map: global row of C -> local row of submatrices */ 2478 #if defined(PETSC_USE_CTABLE) 2479 for (i = 0; i < ismax; i++) { 2480 PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i)); 2481 irow_i = irow[i]; 2482 jmax = nrow[i]; 2483 for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1)); 2484 } 2485 #else 2486 for (i = 0; i < ismax; i++) { 2487 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2488 rmap_i = rmap[i]; 2489 irow_i = irow[i]; 2490 jmax = nrow[i]; 2491 for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2492 } 2493 #endif 2494 2495 /* Update lens from offproc data */ 2496 { 
2497 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2498 2499 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2500 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2501 sbuf1_i = sbuf1[pa[tmp2]]; 2502 jmax = sbuf1_i[0]; 2503 ct1 = 2 * jmax + 1; 2504 ct2 = 0; 2505 rbuf2_i = rbuf2[tmp2]; 2506 rbuf3_i = rbuf3[tmp2]; 2507 for (j = 1; j <= jmax; j++) { 2508 is_no = sbuf1_i[2 * j - 1]; 2509 max1 = sbuf1_i[2 * j]; 2510 lens_i = lens[is_no]; 2511 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2512 rmap_i = rmap[is_no]; 2513 for (k = 0; k < max1; k++, ct1++) { 2514 #if defined(PETSC_USE_CTABLE) 2515 PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row)); 2516 row--; 2517 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2518 #else 2519 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2520 #endif 2521 max2 = rbuf2_i[ct1]; 2522 for (l = 0; l < max2; l++, ct2++) { 2523 if (!allcolumns[is_no]) { 2524 #if defined(PETSC_USE_CTABLE) 2525 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2526 #else 2527 tcol = cmap_i[rbuf3_i[ct2]]; 2528 #endif 2529 if (tcol) lens_i[row]++; 2530 } else { /* allcolumns */ 2531 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2532 } 2533 } 2534 } 2535 } 2536 } 2537 } 2538 PetscCall(PetscFree(r_waits3)); 2539 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2540 PetscCall(PetscFree(s_waits3)); 2541 2542 /* Create the submatrices */ 2543 for (i = 0; i < ismax; i++) { 2544 PetscInt rbs, cbs; 2545 2546 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2547 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2548 2549 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2550 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2551 2552 PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2553 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2554 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2555 2556 /* create struct Mat_SubSppt and attached it to submat */ 2557 PetscCall(PetscNew(&smat_i)); 2558 subc = (Mat_SeqAIJ *)submats[i]->data; 2559 subc->submatis1 = smat_i; 2560 2561 smat_i->destroy = submats[i]->ops->destroy; 2562 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2563 submats[i]->factortype = C->factortype; 2564 2565 smat_i->id = i; 2566 smat_i->nrqs = nrqs; 2567 smat_i->nrqr = nrqr; 2568 smat_i->rbuf1 = rbuf1; 2569 smat_i->rbuf2 = rbuf2; 2570 smat_i->rbuf3 = rbuf3; 2571 smat_i->sbuf2 = sbuf2; 2572 smat_i->req_source2 = req_source2; 2573 2574 smat_i->sbuf1 = sbuf1; 2575 smat_i->ptr = ptr; 2576 smat_i->tmp = tmp; 2577 smat_i->ctr = ctr; 2578 2579 smat_i->pa = pa; 2580 smat_i->req_size = req_size; 2581 smat_i->req_source1 = req_source1; 2582 2583 smat_i->allcolumns = allcolumns[i]; 2584 smat_i->singleis = PETSC_FALSE; 2585 smat_i->row2proc = row2proc[i]; 2586 smat_i->rmap = rmap[i]; 2587 smat_i->cmap = cmap[i]; 2588 } 2589 2590 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2591 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2592 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2593 PetscCall(MatSetType(submats[0], MATDUMMY)); 2594 2595 /* create struct Mat_SubSppt 
and attached it to submat */ 2596 PetscCall(PetscNew(&smat_i)); 2597 submats[0]->data = (void *)smat_i; 2598 2599 smat_i->destroy = submats[0]->ops->destroy; 2600 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2601 submats[0]->factortype = C->factortype; 2602 2603 smat_i->id = 0; 2604 smat_i->nrqs = nrqs; 2605 smat_i->nrqr = nrqr; 2606 smat_i->rbuf1 = rbuf1; 2607 smat_i->rbuf2 = rbuf2; 2608 smat_i->rbuf3 = rbuf3; 2609 smat_i->sbuf2 = sbuf2; 2610 smat_i->req_source2 = req_source2; 2611 2612 smat_i->sbuf1 = sbuf1; 2613 smat_i->ptr = ptr; 2614 smat_i->tmp = tmp; 2615 smat_i->ctr = ctr; 2616 2617 smat_i->pa = pa; 2618 smat_i->req_size = req_size; 2619 smat_i->req_source1 = req_source1; 2620 2621 smat_i->allcolumns = PETSC_FALSE; 2622 smat_i->singleis = PETSC_FALSE; 2623 smat_i->row2proc = NULL; 2624 smat_i->rmap = NULL; 2625 smat_i->cmap = NULL; 2626 } 2627 2628 if (ismax) PetscCall(PetscFree(lens[0])); 2629 PetscCall(PetscFree(lens)); 2630 if (sbuf_aj) { 2631 PetscCall(PetscFree(sbuf_aj[0])); 2632 PetscCall(PetscFree(sbuf_aj)); 2633 } 2634 2635 } /* endof scall == MAT_INITIAL_MATRIX */ 2636 2637 /* Post recv matrix values */ 2638 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2639 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2640 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2641 for (i = 0; i < nrqs; ++i) { 2642 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2643 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2644 } 2645 2646 /* Allocate sending buffers for a->a, and send them off */ 2647 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2648 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2649 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 2650 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2651 2652 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2653 { 2654 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite; 2655 PetscInt cstart = C->cmap->rstart, rstart = 
C->rmap->rstart, *bmap = c->garray; 2656 PetscInt cend = C->cmap->rend; 2657 PetscInt *b_j = b->j; 2658 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2659 2660 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2661 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2662 for (i = 0; i < nrqr; i++) { 2663 rbuf1_i = rbuf1[i]; 2664 sbuf_aa_i = sbuf_aa[i]; 2665 ct1 = 2 * rbuf1_i[0] + 1; 2666 ct2 = 0; 2667 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2668 kmax = rbuf1_i[2 * j]; 2669 for (k = 0; k < kmax; k++, ct1++) { 2670 row = rbuf1_i[ct1] - rstart; 2671 nzA = a_i[row + 1] - a_i[row]; 2672 nzB = b_i[row + 1] - b_i[row]; 2673 ncols = nzA + nzB; 2674 cworkB = b_j + b_i[row]; 2675 vworkA = a_a + a_i[row]; 2676 vworkB = b_a + b_i[row]; 2677 2678 /* load the column values for this row into vals*/ 2679 vals = sbuf_aa_i + ct2; 2680 2681 lwrite = 0; 2682 for (l = 0; l < nzB; l++) { 2683 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2684 } 2685 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2686 for (l = 0; l < nzB; l++) { 2687 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2688 } 2689 2690 ct2 += ncols; 2691 } 2692 } 2693 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2694 } 2695 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2696 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2697 } 2698 2699 /* Assemble the matrices */ 2700 /* First assemble the local rows */ 2701 for (i = 0; i < ismax; i++) { 2702 row2proc_i = row2proc[i]; 2703 subc = (Mat_SeqAIJ *)submats[i]->data; 2704 imat_ilen = subc->ilen; 2705 imat_j = subc->j; 2706 imat_i = subc->i; 2707 imat_a = subc->a; 2708 2709 if (!allcolumns[i]) cmap_i = cmap[i]; 2710 rmap_i = rmap[i]; 2711 irow_i = irow[i]; 2712 jmax = nrow[i]; 2713 for (j = 0; j < jmax; j++) { 2714 row = irow_i[j]; 2715 proc = row2proc_i[j]; 2716 if (proc == rank) { 2717 old_row = row; 2718 
#if defined(PETSC_USE_CTABLE) 2719 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2720 row--; 2721 #else 2722 row = rmap_i[row]; 2723 #endif 2724 ilen_row = imat_ilen[row]; 2725 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2726 mat_i = imat_i[row]; 2727 mat_a = imat_a + mat_i; 2728 mat_j = imat_j + mat_i; 2729 if (!allcolumns[i]) { 2730 for (k = 0; k < ncols; k++) { 2731 #if defined(PETSC_USE_CTABLE) 2732 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2733 #else 2734 tcol = cmap_i[cols[k]]; 2735 #endif 2736 if (tcol) { 2737 *mat_j++ = tcol - 1; 2738 *mat_a++ = vals[k]; 2739 ilen_row++; 2740 } 2741 } 2742 } else { /* allcolumns */ 2743 for (k = 0; k < ncols; k++) { 2744 *mat_j++ = cols[k]; /* global col index! */ 2745 *mat_a++ = vals[k]; 2746 ilen_row++; 2747 } 2748 } 2749 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2750 2751 imat_ilen[row] = ilen_row; 2752 } 2753 } 2754 } 2755 2756 /* Now assemble the off proc rows */ 2757 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2758 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2759 sbuf1_i = sbuf1[pa[tmp2]]; 2760 jmax = sbuf1_i[0]; 2761 ct1 = 2 * jmax + 1; 2762 ct2 = 0; 2763 rbuf2_i = rbuf2[tmp2]; 2764 rbuf3_i = rbuf3[tmp2]; 2765 rbuf4_i = rbuf4[tmp2]; 2766 for (j = 1; j <= jmax; j++) { 2767 is_no = sbuf1_i[2 * j - 1]; 2768 rmap_i = rmap[is_no]; 2769 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2770 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2771 imat_ilen = subc->ilen; 2772 imat_j = subc->j; 2773 imat_i = subc->i; 2774 imat_a = subc->a; 2775 max1 = sbuf1_i[2 * j]; 2776 for (k = 0; k < max1; k++, ct1++) { 2777 row = sbuf1_i[ct1]; 2778 #if defined(PETSC_USE_CTABLE) 2779 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2780 row--; 2781 #else 2782 row = rmap_i[row]; 2783 #endif 2784 ilen = imat_ilen[row]; 2785 mat_i = imat_i[row]; 2786 mat_a = imat_a + mat_i; 2787 mat_j = imat_j + mat_i; 2788 max2 = rbuf2_i[ct1]; 2789 if 
(!allcolumns[is_no]) { 2790 for (l = 0; l < max2; l++, ct2++) { 2791 #if defined(PETSC_USE_CTABLE) 2792 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2793 #else 2794 tcol = cmap_i[rbuf3_i[ct2]]; 2795 #endif 2796 if (tcol) { 2797 *mat_j++ = tcol - 1; 2798 *mat_a++ = rbuf4_i[ct2]; 2799 ilen++; 2800 } 2801 } 2802 } else { /* allcolumns */ 2803 for (l = 0; l < max2; l++, ct2++) { 2804 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2805 *mat_a++ = rbuf4_i[ct2]; 2806 ilen++; 2807 } 2808 } 2809 imat_ilen[row] = ilen; 2810 } 2811 } 2812 } 2813 2814 if (!iscsorted) { /* sort column indices of the rows */ 2815 for (i = 0; i < ismax; i++) { 2816 subc = (Mat_SeqAIJ *)submats[i]->data; 2817 imat_j = subc->j; 2818 imat_i = subc->i; 2819 imat_a = subc->a; 2820 imat_ilen = subc->ilen; 2821 2822 if (allcolumns[i]) continue; 2823 jmax = nrow[i]; 2824 for (j = 0; j < jmax; j++) { 2825 mat_i = imat_i[j]; 2826 mat_a = imat_a + mat_i; 2827 mat_j = imat_j + mat_i; 2828 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2829 } 2830 } 2831 } 2832 2833 PetscCall(PetscFree(r_waits4)); 2834 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2835 PetscCall(PetscFree(s_waits4)); 2836 2837 /* Restore the indices */ 2838 for (i = 0; i < ismax; i++) { 2839 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2840 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2841 } 2842 2843 for (i = 0; i < ismax; i++) { 2844 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2845 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2846 } 2847 2848 /* Destroy allocated memory */ 2849 if (sbuf_aa) { 2850 PetscCall(PetscFree(sbuf_aa[0])); 2851 PetscCall(PetscFree(sbuf_aa)); 2852 } 2853 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2854 2855 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2856 PetscCall(PetscFree(rbuf4)); 2857 2858 
PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2859 PetscFunctionReturn(0); 2860 } 2861 2862 /* 2863 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2864 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2865 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2866 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2867 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2868 state, and needs to be "assembled" later by compressing B's column space. 2869 2870 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2871 Following this call, C->A & C->B have been created, even if empty. 2872 */ 2873 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2874 { 2875 /* If making this function public, change the error returned in this function away from _PLIB. */ 2876 Mat_MPIAIJ *aij; 2877 Mat_SeqAIJ *Baij; 2878 PetscBool seqaij, Bdisassembled; 2879 PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count; 2880 PetscScalar v; 2881 const PetscInt *rowindices, *colindices; 2882 2883 PetscFunctionBegin; 2884 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
*/ 2885 if (A) { 2886 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2887 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2888 if (rowemb) { 2889 PetscCall(ISGetLocalSize(rowemb, &m)); 2890 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2891 } else { 2892 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2893 } 2894 if (dcolemb) { 2895 PetscCall(ISGetLocalSize(dcolemb, &n)); 2896 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2897 } else { 2898 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2899 } 2900 } 2901 if (B) { 2902 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2903 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2904 if (rowemb) { 2905 PetscCall(ISGetLocalSize(rowemb, &m)); 2906 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2907 } else { 2908 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2909 } 2910 if (ocolemb) { 2911 PetscCall(ISGetLocalSize(ocolemb, &n)); 2912 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2913 } else { 2914 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with 
the MPIAIJ matrix"); 2915 } 2916 } 2917 2918 aij = (Mat_MPIAIJ *)C->data; 2919 if (!aij->A) { 2920 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2921 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2922 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2923 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2924 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2925 } 2926 if (A) { 2927 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2928 } else { 2929 PetscCall(MatSetUp(aij->A)); 2930 } 2931 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2932 /* 2933 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2934 need to "disassemble" B -- convert it to using C's global indices. 2935 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2936 2937 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2938 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2939 2940 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2941 At least avoid calling MatSetValues() and the implied searches? 2942 */ 2943 2944 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2945 #if defined(PETSC_USE_CTABLE) 2946 PetscCall(PetscHMapIDestroy(&aij->colmap)); 2947 #else 2948 PetscCall(PetscFree(aij->colmap)); 2949 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. 
*/ 2950 #endif 2951 ngcol = 0; 2952 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2953 if (aij->garray) { PetscCall(PetscFree(aij->garray)); } 2954 PetscCall(VecDestroy(&aij->lvec)); 2955 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2956 } 2957 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2958 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2959 } 2960 Bdisassembled = PETSC_FALSE; 2961 if (!aij->B) { 2962 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2963 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2964 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2965 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2966 Bdisassembled = PETSC_TRUE; 2967 } 2968 if (B) { 2969 Baij = (Mat_SeqAIJ *)B->data; 2970 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2971 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2972 for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2973 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2974 PetscCall(PetscFree(nz)); 2975 } 2976 2977 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2978 shift = rend - rstart; 2979 count = 0; 2980 rowindices = NULL; 2981 colindices = NULL; 2982 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2983 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2984 for (i = 0; i < B->rmap->n; i++) { 2985 PetscInt row; 2986 row = i; 2987 if (rowindices) row = rowindices[i]; 2988 for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2989 col = Baij->j[count]; 2990 if (colindices) col = colindices[col]; 2991 if (Bdisassembled && col >= rstart) col += shift; 2992 v = Baij->a[count]; 2993 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2994 ++count; 2995 } 2996 } 2997 /* No assembly for aij->B is necessary. */ 2998 /* FIXME: set aij->B's nonzerostate correctly. 
*/ 2999 } else { 3000 PetscCall(MatSetUp(aij->B)); 3001 } 3002 C->preallocated = PETSC_TRUE; 3003 C->was_assembled = PETSC_FALSE; 3004 C->assembled = PETSC_FALSE; 3005 /* 3006 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3007 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3008 */ 3009 PetscFunctionReturn(0); 3010 } 3011 3012 /* 3013 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3014 */ 3015 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 3016 { 3017 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 3018 3019 PetscFunctionBegin; 3020 PetscValidPointer(A, 2); 3021 PetscValidPointer(B, 3); 3022 /* FIXME: make sure C is assembled */ 3023 *A = aij->A; 3024 *B = aij->B; 3025 /* Note that we don't incref *A and *B, so be careful! */ 3026 PetscFunctionReturn(0); 3027 } 3028 3029 /* 3030 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3031 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
/*
  Generic driver: for each pair (isrow[i], iscol[i]) extract the corresponding submatrix of C as an
  MPIAIJ matrix on the IS's (sub)communicator when that communicator has size > 1, or a SeqAIJ matrix
  otherwise. The format-specific work is delegated through the function pointers:
    getsubmats_seq - extracts sequential submatrices (e.g., MatCreateSubMatrices_MPIAIJ);
    getlocalmats   - accepted but not referenced in this routine's body;
    setseqmat      - embeds one sequential matrix into a sequential submatrix;
    setseqmats     - embeds diag/off-diag sequential pieces into a parallel submatrix.
  NOTE(review): isrow[]/iscol[] are sorted in place below via ISSort(), so the caller's index sets
  may come back permuted -- confirm callers tolerate this.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    i, ii, cismax, ispar;
  Mat        *A, *B; /* A: diag pieces, one per IS; B: off-diag pieces, one per parallel (size > 1) IS */
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the "parallel" index sets: those living on a communicator of size > 1. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        j, issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    for (j = 1; j < issize; ++j) {
      /* Sorted, so duplicates are necessarily adjacent. */
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        /* Destroys A[i] through the alias; the array slot is not cleared but A is only PetscFree'd below. */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* Ownership of A[i] transfers to the caller's submat array. */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(0);
}

/* MPIAIJ instantiation of the generic MatCreateSubMatricesMPI_MPIXAIJ() driver above. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}