1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 /* 22 Takes a general IS and builds a block version of the IS that contains the given IS plus any needed values to fill out the blocks 23 24 The entire MatIncreaseOverlap_MPIAIJ() stack could be rewritten to respect the bs and it would offer higher performance but 25 that is a very major recoding job. 
   Possible scalability issues with this routine because it allocates space proportional to Nmax-Nmin
*/
static PetscErrorCode ISAdjustForBlockSize(PetscInt bs, PetscInt imax, IS is[])
{
  PetscFunctionBegin;
  /* Replace each general IS with a block IS of block size bs: collapse each index to its block
     (idx/bs), deduplicate with a bit table over the [Nmin/bs, Nmax/bs] range, and rebuild. */
  for (PetscInt i = 0; i < imax; i++) {
    if (!is[i]) break; /* NOTE(review): stops at the first NULL IS rather than skipping it — presumably the array is NULL-terminated; confirm with callers */
    PetscInt        n = 0, N, Nmax, Nmin;
    const PetscInt *idx;
    PetscInt       *nidx = NULL;
    MPI_Comm        comm;
    PetscBT         bt;

    PetscCall(ISGetLocalSize(is[i], &N));
    if (N > 0) { /* Nmax and Nmin are garbage for empty IS */
      PetscCall(ISGetIndices(is[i], &idx));
      PetscCall(ISGetMinMax(is[i], &Nmin, &Nmax));
      Nmin = Nmin / bs;
      Nmax = Nmax / bs;
      /* first pass: count the distinct blocks so nidx can be sized exactly */
      PetscCall(PetscBTCreate(Nmax - Nmin, &bt));
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) n++;
      }
      PetscCall(PetscMalloc1(n, &nidx));
      n = 0;
      PetscCall(PetscBTMemzero(Nmax - Nmin, bt));
      /* second pass: collect each distinct block index once */
      for (PetscInt j = 0; j < N; j++) {
        if (!PetscBTLookupSet(bt, idx[j] / bs - Nmin)) nidx[n++] = idx[j] / bs;
      }
      PetscCall(PetscBTDestroy(&bt));
      PetscCall(ISRestoreIndices(is[i], &idx));
    }
    PetscCall(PetscObjectGetComm((PetscObject)is[i], &comm));
    PetscCall(ISDestroy(is + i));
    /* nidx ownership passes to the new IS (PETSC_OWN_POINTER); NULL/0 is valid for an empty IS */
    PetscCall(ISCreateBlock(comm, bs, n, nidx, PETSC_OWN_POINTER, is + i));
  }
  PetscFunctionReturn(0);
}

/*
   Expand each index set in is[] by ov levels of overlap in the matrix graph of C,
   then (if C has a square block size > 1) round the result out to whole blocks.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(0);
}

/*
   Same contract as MatIncreaseOverlap_MPIAIJ() but uses the memory-scalable
   (PetscSF-based) one-level expansion.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
  if (C->rmap->bs > 1 && C->rmap->bs == C->cmap->bs) PetscCall(ISAdjustForBlockSize(C->rmap->bs, imax, is));
  PetscFunctionReturn(0);
}

/*
   One level of overlap, memory-scalable version: rows owned remotely are shipped
   to their owners (two-sided exchange over a PetscSF), the owners expand them,
   and the expanded index data comes back and is merged into is[].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communcation */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  /* remote[] is handed off to the SF (PETSC_OWN_POINTER) — do not free it here */
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: the local expansion runs between Begin and End */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}

/*
   Merge the overlap data received back from remote owners (recvdata, a stream of
   <is id, count, rows...> records) into the local index sets, deduplicating, and
   recreate each IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt        *isz, isz_i, i, j, is_id, data_size;
  PetscInt         col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt  *indices_i_temp;
  MPI_Comm        *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of size (max_lsize + nrecvs) per IS: existing indices first, received ones appended */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}

/*
   For each requesting rank, expand the rows it asked about (fromrows: pairs
   <is id, row>) using the local diagonal (A) and off-diagonal (B) parts, and
   pack the result as triples <is id, count, columns...> into *sbrows with
   per-rank sizes/offsets in *sbrowsizes. Caller frees both outputs.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count expanded sizes so the send buffer can be allocated exactly */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* second pass: identical expansion, this time writing the packed data */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
   Expand each IS by the columns of the locally owned rows it contains
   (rows owned by other ranks are skipped — they are handled remotely),
   deduplicate, and recreate the IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat: local columns shifted to global numbering */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat: gcols maps compressed off-diagonal columns to global */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants
processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscTable *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0;
  PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  /* w1[p]: total indices to send to rank p; w3[p]: number of index sets contributing;
     w4: per-IS scratch counts */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j];
        w3[j]++;
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to intself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    /* carve the single buffer tmp into one slice per destination rank */
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscTableCreate(n[i], M, &table_data[i]));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          /* table stores row+1 -> position+1; 0 is reserved as "absent" */
          PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0];
          outdat_j[2 * k]     = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

  /* Do Local work while the phase-1 messages are in flight */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i = 0; i < nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1; /* data starts right after the header */
      jmax    = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];
        is_no   = rbuf2_i[2 * j - 1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    /* unpack <row+1, pos+1> table entries back into a dense array */
    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &k, &j));
      tdata[--j] = --k;
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscTableDestroy((PetscTable *)&table_data[i]));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
  the work on the local processor.
  Inputs:
   C      - MAT_MPIAIJ;
   imax   - total no of index sets processed at a time;
   table  - an array of char - size = m bits.

  Output:
   isz    - array containing the count of the solution elements corresponding
            to each index set;
   data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      /* entries are stored shifted by one (key row+1 -> position+1) */
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &row, &j));
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for each row currently in the set, append all its column indices not yet present
       (the bit table table_i tracks membership). Only the first max entries are expanded;
       entries appended during this loop are NOT themselves expanded here. */
    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
  and return the output

  Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf - an array of pointers to the received requests

  Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a remalloc is required we need only copy the data for that one row
  rather then all previous rows as it is now where a single large chunk of
  memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable;

  PetscFunctionBegin;
  /* cache the CSR arrays of the diagonal (A) and off-diagonal (B) blocks */
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of requested rows over all messages; total_sz = total number
     of row indices carried in the incoming buffers */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  /* max1 = requested rows times the average number of nonzeros per local row;
     the reply buffer estimate is 3x the larger of that and total_sz, and is
     regrown below if it proves too small */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1);
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0;
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1 = 2 * rbuf_0 + 1; /* skip header: count + 2 ints per IS */
    ct2 = ct1;
    ct3 += ct1;
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m, xtable));
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      /* first pass: copy the requested rows themselves, deduplicated via xtable */
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single shared buffer by 1.5x; xdata[1..i] are sub-slices of
               xdata[0], so they must be re-based after the copy */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* second pass: for each accepted row, append the global columns of its nonzeros
         (the overlap), still deduplicated via xtable */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) { /* diagonal block: local column + cstart */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) { /* off-diagonal block: garray gives the global column */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2 * j]     = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i] = ct2;                                   /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat          B;
  Mat_MPIAIJ  *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ  *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt  size, rank, *recvcounts = NULL, *displs = NULL;
  PetscInt     sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
  PetscInt     m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell
       every processor the number of nonzeros per row
    */
    /* lens[] is indexed by GLOBAL row; each rank fills only its own row range and the
       in-place allgather below completes the rest */
    PetscCall(PetscMalloc1(A->rmap->N, &lens));
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
    PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) {
      recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    /* caller receives a 2-entry array with the matrix in slot 0 (freed by the caller) */
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, directly into B's j array
       at the offset of this rank's first row
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge each row back into ascending global-column order:
       B-columns left of the diagonal block, then A-columns, then B-columns right of it */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i = 0; i < size; i++) {
      recvcounts[i] = 0;
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf, *recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column-index pass above, so the
       values land next to their matching column indices */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); /* NULL on the MAT_REUSE_MATRIX path */
    for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts, displs));

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         submat, A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
  PetscInt   *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j,
nzA, nzB; 1234 PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray; 1235 const PetscInt *icol, *irow; 1236 PetscInt nrow, ncol, start; 1237 PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr; 1238 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc; 1239 PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr; 1240 PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz; 1241 PetscInt *lens, rmax, ncols, *cols, Crow; 1242 #if defined(PETSC_USE_CTABLE) 1243 PetscTable cmap, rmap; 1244 PetscInt *cmap_loc, *rmap_loc; 1245 #else 1246 PetscInt *cmap, *rmap; 1247 #endif 1248 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i; 1249 PetscInt *cworkB, lwrite, *subcols, *row2proc; 1250 PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL; 1251 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 1252 MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4; 1253 MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1254 MPI_Status *r_status3 = NULL, *r_status4, *s_status4; 1255 MPI_Comm comm; 1256 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1257 PetscMPIInt *onodes1, *olengths1, idex, end; 1258 Mat_SubSppt *smatis1; 1259 PetscBool isrowsorted, iscolsorted; 1260 1261 PetscFunctionBegin; 1262 PetscValidLogicalCollectiveInt(C, ismax, 2); 1263 PetscValidLogicalCollectiveEnum(C, scall, 5); 1264 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1265 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1266 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1267 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1268 size = c->size; 1269 rank = c->rank; 1270 1271 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1272 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1273 
PetscCall(ISGetIndices(isrow[0], &irow)); 1274 PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1275 if (allcolumns) { 1276 icol = NULL; 1277 ncol = C->cmap->N; 1278 } else { 1279 PetscCall(ISGetIndices(iscol[0], &icol)); 1280 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1281 } 1282 1283 if (scall == MAT_INITIAL_MATRIX) { 1284 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1285 1286 /* Get some new tags to keep the communication clean */ 1287 tag1 = ((PetscObject)C)->tag; 1288 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1289 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1290 1291 /* evaluate communication - mesg to who, length of mesg, and buffer space 1292 required. Based on this, buffers are allocated, and data copied into them */ 1293 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1294 PetscCall(PetscMalloc1(nrow, &row2proc)); 1295 1296 /* w1[proc] = num of rows owned by proc -- to be requested */ 1297 proc = 0; 1298 nrqs = 0; /* num of outgoing messages */ 1299 for (j = 0; j < nrow; j++) { 1300 row = irow[j]; 1301 if (!isrowsorted) proc = 0; 1302 while (row >= C->rmap->range[proc + 1]) proc++; 1303 w1[proc]++; 1304 row2proc[j] = proc; /* map row index to proc */ 1305 1306 if (proc != rank && !w2[proc]) { 1307 w2[proc] = 1; 1308 nrqs++; 1309 } 1310 } 1311 w1[rank] = 0; /* rows owned by self will not be requested */ 1312 1313 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1314 for (proc = 0, j = 0; proc < size; proc++) { 1315 if (w1[proc]) pa[j++] = proc; 1316 } 1317 1318 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1319 msz = 0; /* total mesg length (for all procs) */ 1320 for (i = 0; i < nrqs; i++) { 1321 proc = pa[i]; 1322 w1[proc] += 3; 1323 msz += w1[proc]; 1324 } 1325 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1326 1327 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1328 /* if 
w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1329 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1330 1331 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1332 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1333 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1334 1335 /* Now post the Irecvs corresponding to these messages */ 1336 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1337 1338 PetscCall(PetscFree(onodes1)); 1339 PetscCall(PetscFree(olengths1)); 1340 1341 /* Allocate Memory for outgoing messages */ 1342 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1343 PetscCall(PetscArrayzero(sbuf1, size)); 1344 PetscCall(PetscArrayzero(ptr, size)); 1345 1346 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1347 iptr = tmp; 1348 for (i = 0; i < nrqs; i++) { 1349 proc = pa[i]; 1350 sbuf1[proc] = iptr; 1351 iptr += w1[proc]; 1352 } 1353 1354 /* Form the outgoing messages */ 1355 /* Initialize the header space */ 1356 for (i = 0; i < nrqs; i++) { 1357 proc = pa[i]; 1358 PetscCall(PetscArrayzero(sbuf1[proc], 3)); 1359 ptr[proc] = sbuf1[proc] + 3; 1360 } 1361 1362 /* Parse the isrow and copy data into outbuf */ 1363 PetscCall(PetscArrayzero(ctr, size)); 1364 for (j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1365 proc = row2proc[j]; 1366 if (proc != rank) { /* copy to the outgoing buf*/ 1367 *ptr[proc] = irow[j]; 1368 ctr[proc]++; 1369 ptr[proc]++; 1370 } 1371 } 1372 1373 /* Update the headers for the current IS */ 1374 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 1375 if ((ctr_j = ctr[j])) { 1376 sbuf1_j = sbuf1[j]; 1377 k = ++sbuf1_j[0]; 1378 sbuf1_j[2 * k] = ctr_j; 1379 sbuf1_j[2 * k - 1] = 0; 1380 } 1381 } 1382 1383 /* Now post the sends */ 1384 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1385 for (i = 0; i < nrqs; ++i) { 1386 proc = 
pa[i]; 1387 PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i)); 1388 } 1389 1390 /* Post Receives to capture the buffer size */ 1391 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1392 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1393 1394 if (nrqs) rbuf2[0] = tmp + msz; 1395 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1396 1397 for (i = 0; i < nrqs; ++i) { 1398 proc = pa[i]; 1399 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i)); 1400 } 1401 1402 PetscCall(PetscFree2(w1, w2)); 1403 1404 /* Send to other procs the buf size they should allocate */ 1405 /* Receive messages*/ 1406 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1407 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1408 1409 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1410 for (i = 0; i < nrqr; ++i) { 1411 req_size[i] = 0; 1412 rbuf1_i = rbuf1[i]; 1413 start = 2 * rbuf1_i[0] + 1; 1414 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1415 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1416 sbuf2_i = sbuf2[i]; 1417 for (j = start; j < end; j++) { 1418 k = rbuf1_i[j] - rstart; 1419 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1420 sbuf2_i[j] = ncols; 1421 req_size[i] += ncols; 1422 } 1423 req_source1[i] = r_status1[i].MPI_SOURCE; 1424 1425 /* form the header */ 1426 sbuf2_i[0] = req_size[i]; 1427 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1428 1429 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1430 } 1431 1432 PetscCall(PetscFree(r_status1)); 1433 PetscCall(PetscFree(r_waits1)); 1434 1435 /* rbuf2 is received, Post recv column indices a->j */ 1436 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1437 1438 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1439 for (i = 0; i < nrqs; ++i) { 1440 
PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 1441 req_source2[i] = r_status2[i].MPI_SOURCE; 1442 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1443 } 1444 1445 /* Wait on sends1 and sends2 */ 1446 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1447 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1448 PetscCall(PetscFree(s_waits1)); 1449 PetscCall(PetscFree(s_status1)); 1450 1451 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1452 PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2)); 1453 1454 /* Now allocate sending buffers for a->j, and send them off */ 1455 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 1456 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1457 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 1458 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1459 1460 for (i = 0; i < nrqr; i++) { /* for each requested message */ 1461 rbuf1_i = rbuf1[i]; 1462 sbuf_aj_i = sbuf_aj[i]; 1463 ct1 = 2 * rbuf1_i[0] + 1; 1464 ct2 = 0; 1465 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1466 1467 kmax = rbuf1[i][2]; 1468 for (k = 0; k < kmax; k++, ct1++) { /* for each row */ 1469 row = rbuf1_i[ct1] - rstart; 1470 nzA = ai[row + 1] - ai[row]; 1471 nzB = bi[row + 1] - bi[row]; 1472 ncols = nzA + nzB; 1473 cworkA = aj + ai[row]; 1474 cworkB = bj + bi[row]; 1475 1476 /* load the column indices for this row into cols*/ 1477 cols = sbuf_aj_i + ct2; 1478 1479 lwrite = 0; 1480 for (l = 0; l < nzB; l++) { 1481 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1482 } 1483 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1484 for (l = 0; l < nzB; l++) { 1485 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1486 } 1487 1488 ct2 += ncols; 1489 } 1490 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1491 } 1492 1493 /* create column map (cmap): 
global col of C -> local col of submat */ 1494 #if defined(PETSC_USE_CTABLE) 1495 if (!allcolumns) { 1496 PetscCall(PetscTableCreate(ncol, C->cmap->N, &cmap)); 1497 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1498 for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1499 if (icol[j] >= cstart && icol[j] < cend) { 1500 cmap_loc[icol[j] - cstart] = j + 1; 1501 } else { /* use PetscTable for non-local col indices */ 1502 PetscCall(PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES)); 1503 } 1504 } 1505 } else { 1506 cmap = NULL; 1507 cmap_loc = NULL; 1508 } 1509 PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1510 #else 1511 if (!allcolumns) { 1512 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1513 for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1514 } else { 1515 cmap = NULL; 1516 } 1517 #endif 1518 1519 /* Create lens for MatSeqAIJSetPreallocation() */ 1520 PetscCall(PetscCalloc1(nrow, &lens)); 1521 1522 /* Compute lens from local part of C */ 1523 for (j = 0; j < nrow; j++) { 1524 row = irow[j]; 1525 proc = row2proc[j]; 1526 if (proc == rank) { 1527 /* diagonal part A = c->A */ 1528 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1529 cols = aj + ai[row - rstart]; 1530 if (!allcolumns) { 1531 for (k = 0; k < ncols; k++) { 1532 #if defined(PETSC_USE_CTABLE) 1533 tcol = cmap_loc[cols[k]]; 1534 #else 1535 tcol = cmap[cols[k] + cstart]; 1536 #endif 1537 if (tcol) lens[j]++; 1538 } 1539 } else { /* allcolumns */ 1540 lens[j] = ncols; 1541 } 1542 1543 /* off-diagonal part B = c->B */ 1544 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1545 cols = bj + bi[row - rstart]; 1546 if (!allcolumns) { 1547 for (k = 0; k < ncols; k++) { 1548 #if defined(PETSC_USE_CTABLE) 1549 PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol)); 1550 #else 1551 tcol = cmap[bmap[cols[k]]]; 1552 #endif 1553 if (tcol) lens[j]++; 1554 } 1555 } else { /* allcolumns */ 1556 lens[j] += ncols; 1557 } 1558 } 1559 } 1560 1561 /* Create row map (rmap): global row of C 
-> local row of submat */ 1562 #if defined(PETSC_USE_CTABLE) 1563 PetscCall(PetscTableCreate(nrow, C->rmap->N, &rmap)); 1564 for (j = 0; j < nrow; j++) { 1565 row = irow[j]; 1566 proc = row2proc[j]; 1567 if (proc == rank) { /* a local row */ 1568 rmap_loc[row - rstart] = j; 1569 } else { 1570 PetscCall(PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES)); 1571 } 1572 } 1573 #else 1574 PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1575 for (j = 0; j < nrow; j++) rmap[irow[j]] = j; 1576 #endif 1577 1578 /* Update lens from offproc data */ 1579 /* recv a->j is done */ 1580 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1581 for (i = 0; i < nrqs; i++) { 1582 proc = pa[i]; 1583 sbuf1_i = sbuf1[proc]; 1584 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1585 ct1 = 2 + 1; 1586 ct2 = 0; 1587 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1588 rbuf3_i = rbuf3[i]; /* received C->j */ 1589 1590 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1591 max1 = sbuf1_i[2]; 1592 for (k = 0; k < max1; k++, ct1++) { 1593 #if defined(PETSC_USE_CTABLE) 1594 PetscCall(PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row)); 1595 row--; 1596 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1597 #else 1598 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1599 #endif 1600 /* Now, store row index of submat in sbuf1_i[ct1] */ 1601 sbuf1_i[ct1] = row; 1602 1603 nnz = rbuf2_i[ct1]; 1604 if (!allcolumns) { 1605 for (l = 0; l < nnz; l++, ct2++) { 1606 #if defined(PETSC_USE_CTABLE) 1607 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1608 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1609 } else { 1610 PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol)); 1611 } 1612 #else 1613 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1614 #endif 1615 if (tcol) lens[row]++; 1616 } 1617 } else { /* allcolumns */ 1618 lens[row] += nnz; 1619 } 1620 } 1621 } 1622 
PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1623 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1624 1625 /* Create the submatrices */ 1626 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1627 PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1628 1629 PetscCall(ISGetBlockSize(isrow[0], &i)); 1630 PetscCall(ISGetBlockSize(iscol[0], &j)); 1631 PetscCall(MatSetBlockSizes(submat, i, j)); 1632 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1633 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1634 1635 /* create struct Mat_SubSppt and attached it to submat */ 1636 PetscCall(PetscNew(&smatis1)); 1637 subc = (Mat_SeqAIJ *)submat->data; 1638 subc->submatis1 = smatis1; 1639 1640 smatis1->id = 0; 1641 smatis1->nrqs = nrqs; 1642 smatis1->nrqr = nrqr; 1643 smatis1->rbuf1 = rbuf1; 1644 smatis1->rbuf2 = rbuf2; 1645 smatis1->rbuf3 = rbuf3; 1646 smatis1->sbuf2 = sbuf2; 1647 smatis1->req_source2 = req_source2; 1648 1649 smatis1->sbuf1 = sbuf1; 1650 smatis1->ptr = ptr; 1651 smatis1->tmp = tmp; 1652 smatis1->ctr = ctr; 1653 1654 smatis1->pa = pa; 1655 smatis1->req_size = req_size; 1656 smatis1->req_source1 = req_source1; 1657 1658 smatis1->allcolumns = allcolumns; 1659 smatis1->singleis = PETSC_TRUE; 1660 smatis1->row2proc = row2proc; 1661 smatis1->rmap = rmap; 1662 smatis1->cmap = cmap; 1663 #if defined(PETSC_USE_CTABLE) 1664 smatis1->rmap_loc = rmap_loc; 1665 smatis1->cmap_loc = cmap_loc; 1666 #endif 1667 1668 smatis1->destroy = submat->ops->destroy; 1669 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1670 submat->factortype = C->factortype; 1671 1672 /* compute rmax */ 1673 rmax = 0; 1674 for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1675 1676 } else { /* scall == MAT_REUSE_MATRIX */ 1677 submat = submats[0]; 1678 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1679 1680 subc = (Mat_SeqAIJ *)submat->data; 1681 rmax = subc->rmax; 1682 smatis1 = subc->submatis1; 1683 nrqs = smatis1->nrqs; 1684 nrqr = smatis1->nrqr; 1685 rbuf1 = smatis1->rbuf1; 1686 rbuf2 = smatis1->rbuf2; 1687 rbuf3 = smatis1->rbuf3; 1688 req_source2 = smatis1->req_source2; 1689 1690 sbuf1 = smatis1->sbuf1; 1691 sbuf2 = smatis1->sbuf2; 1692 ptr = smatis1->ptr; 1693 tmp = smatis1->tmp; 1694 ctr = smatis1->ctr; 1695 1696 pa = smatis1->pa; 1697 req_size = smatis1->req_size; 1698 req_source1 = smatis1->req_source1; 1699 1700 allcolumns = smatis1->allcolumns; 1701 row2proc = smatis1->row2proc; 1702 rmap = smatis1->rmap; 1703 cmap = smatis1->cmap; 1704 #if defined(PETSC_USE_CTABLE) 1705 rmap_loc = smatis1->rmap_loc; 1706 cmap_loc = smatis1->cmap_loc; 1707 #endif 1708 } 1709 1710 /* Post recv matrix values */ 1711 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1712 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1713 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1714 for (i = 0; i < nrqs; ++i) { 1715 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1716 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1717 } 1718 1719 /* Allocate sending buffers for a->a, and send them off */ 1720 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1721 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1722 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 1723 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1724 1725 for (i = 0; i < nrqr; i++) { 1726 rbuf1_i = rbuf1[i]; 1727 sbuf_aa_i = sbuf_aa[i]; 1728 ct1 = 2 * rbuf1_i[0] + 1; 1729 ct2 = 0; 1730 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1731 1732 kmax = rbuf1_i[2]; 1733 for (k = 0; k < kmax; k++, ct1++) { 1734 row = rbuf1_i[ct1] - rstart; 1735 nzA = ai[row + 1] - ai[row]; 1736 nzB = bi[row + 1] - bi[row]; 1737 ncols = 
nzA + nzB; 1738 cworkB = bj + bi[row]; 1739 vworkA = a_a + ai[row]; 1740 vworkB = b_a + bi[row]; 1741 1742 /* load the column values for this row into vals*/ 1743 vals = sbuf_aa_i + ct2; 1744 1745 lwrite = 0; 1746 for (l = 0; l < nzB; l++) { 1747 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1748 } 1749 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1750 for (l = 0; l < nzB; l++) { 1751 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1752 } 1753 1754 ct2 += ncols; 1755 } 1756 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1757 } 1758 1759 /* Assemble submat */ 1760 /* First assemble the local rows */ 1761 for (j = 0; j < nrow; j++) { 1762 row = irow[j]; 1763 proc = row2proc[j]; 1764 if (proc == rank) { 1765 Crow = row - rstart; /* local row index of C */ 1766 #if defined(PETSC_USE_CTABLE) 1767 row = rmap_loc[Crow]; /* row index of submat */ 1768 #else 1769 row = rmap[row]; 1770 #endif 1771 1772 if (allcolumns) { 1773 /* diagonal part A = c->A */ 1774 ncols = ai[Crow + 1] - ai[Crow]; 1775 cols = aj + ai[Crow]; 1776 vals = a_a + ai[Crow]; 1777 i = 0; 1778 for (k = 0; k < ncols; k++) { 1779 subcols[i] = cols[k] + cstart; 1780 subvals[i++] = vals[k]; 1781 } 1782 1783 /* off-diagonal part B = c->B */ 1784 ncols = bi[Crow + 1] - bi[Crow]; 1785 cols = bj + bi[Crow]; 1786 vals = b_a + bi[Crow]; 1787 for (k = 0; k < ncols; k++) { 1788 subcols[i] = bmap[cols[k]]; 1789 subvals[i++] = vals[k]; 1790 } 1791 1792 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1793 1794 } else { /* !allcolumns */ 1795 #if defined(PETSC_USE_CTABLE) 1796 /* diagonal part A = c->A */ 1797 ncols = ai[Crow + 1] - ai[Crow]; 1798 cols = aj + ai[Crow]; 1799 vals = a_a + ai[Crow]; 1800 i = 0; 1801 for (k = 0; k < ncols; k++) { 1802 tcol = cmap_loc[cols[k]]; 1803 if (tcol) { 1804 subcols[i] = --tcol; 1805 subvals[i++] = vals[k]; 1806 } 1807 } 1808 1809 /* off-diagonal part B = c->B 
*/ 1810 ncols = bi[Crow + 1] - bi[Crow]; 1811 cols = bj + bi[Crow]; 1812 vals = b_a + bi[Crow]; 1813 for (k = 0; k < ncols; k++) { 1814 PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol)); 1815 if (tcol) { 1816 subcols[i] = --tcol; 1817 subvals[i++] = vals[k]; 1818 } 1819 } 1820 #else 1821 /* diagonal part A = c->A */ 1822 ncols = ai[Crow + 1] - ai[Crow]; 1823 cols = aj + ai[Crow]; 1824 vals = a_a + ai[Crow]; 1825 i = 0; 1826 for (k = 0; k < ncols; k++) { 1827 tcol = cmap[cols[k] + cstart]; 1828 if (tcol) { 1829 subcols[i] = --tcol; 1830 subvals[i++] = vals[k]; 1831 } 1832 } 1833 1834 /* off-diagonal part B = c->B */ 1835 ncols = bi[Crow + 1] - bi[Crow]; 1836 cols = bj + bi[Crow]; 1837 vals = b_a + bi[Crow]; 1838 for (k = 0; k < ncols; k++) { 1839 tcol = cmap[bmap[cols[k]]]; 1840 if (tcol) { 1841 subcols[i] = --tcol; 1842 subvals[i++] = vals[k]; 1843 } 1844 } 1845 #endif 1846 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1847 } 1848 } 1849 } 1850 1851 /* Now assemble the off-proc rows */ 1852 for (i = 0; i < nrqs; i++) { /* for each requested message */ 1853 /* recv values from other processes */ 1854 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1855 proc = pa[idex]; 1856 sbuf1_i = sbuf1[proc]; 1857 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1858 ct1 = 2 + 1; 1859 ct2 = 0; /* count of received C->j */ 1860 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1861 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1862 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1863 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1864 1865 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1866 max1 = sbuf1_i[2]; /* num of rows */ 1867 for (k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1868 row = sbuf1_i[ct1]; 
/* row index of submat */ 1869 if (!allcolumns) { 1870 idex = 0; 1871 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1872 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1873 for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1874 #if defined(PETSC_USE_CTABLE) 1875 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1876 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1877 } else { 1878 PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol)); 1879 } 1880 #else 1881 tcol = cmap[rbuf3_i[ct2]]; 1882 #endif 1883 if (tcol) { 1884 subcols[idex] = --tcol; /* may not be sorted */ 1885 subvals[idex++] = rbuf4_i[ct2]; 1886 1887 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1888 For reuse, we replace received C->j with index that should be inserted to submat */ 1889 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1890 } 1891 } 1892 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1893 } else { /* scall == MAT_REUSE_MATRIX */ 1894 submat = submats[0]; 1895 subc = (Mat_SeqAIJ *)submat->data; 1896 1897 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1898 for (l = 0; l < nnz; l++) { 1899 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1900 subvals[idex++] = rbuf4_i[ct2]; 1901 } 1902 1903 bj = subc->j + subc->i[row]; /* sorted column indices */ 1904 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1905 } 1906 } else { /* allcolumns */ 1907 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1908 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES)); 1909 ct2 += nnz; 1910 } 1911 } 1912 } 1913 1914 /* sending a->a are done */ 1915 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1916 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1917 1918 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1919 
PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY)); 1920 submats[0] = submat; 1921 1922 /* Restore the indices */ 1923 PetscCall(ISRestoreIndices(isrow[0], &irow)); 1924 if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol)); 1925 1926 /* Destroy allocated memory */ 1927 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 1928 PetscCall(PetscFree3(rbuf4, subcols, subvals)); 1929 if (sbuf_aa) { 1930 PetscCall(PetscFree(sbuf_aa[0])); 1931 PetscCall(PetscFree(sbuf_aa)); 1932 } 1933 1934 if (scall == MAT_INITIAL_MATRIX) { 1935 PetscCall(PetscFree(lens)); 1936 if (sbuf_aj) { 1937 PetscCall(PetscFree(sbuf_aj[0])); 1938 PetscCall(PetscFree(sbuf_aj)); 1939 } 1940 } 1941 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 1942 PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a)); 1943 PetscFunctionReturn(0); 1944 } 1945 /* Single-IS entry point: on MAT_INITIAL_MATRIX allocates the output array with one extra slot (PetscCalloc1(2, submat); the second slot presumably holds a dummy/reuse matrix -- TODO confirm against _SingleIS_Local), detects the "all columns" special case (identity iscol spanning every global column), then delegates the real work to MatCreateSubMatrices_MPIAIJ_SingleIS_Local() */ 1946 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1947 { 1948 PetscInt ncol; 1949 PetscBool colflag, allcolumns = PETSC_FALSE; 1950 1951 PetscFunctionBegin; 1952 /* Allocate memory to hold all the submatrices */ 1953 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat)); 1954 1955 /* Check for special case: each processor gets entire matrix columns */ 1956 PetscCall(ISIdentity(iscol[0], &colflag)); 1957 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1958 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1959 1960 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat)); 1961 PetscFunctionReturn(0); 1962 } 1963 /* General entry point: extracts ismax sequential submatrices of the parallel matrix C. The work is split into globally agreed stages of at most nmax submatrices each (nmax sized so a stage's index data stays near 20 MB) so the communication buffers stay bounded; fast paths hand off to the single-IS routine (flag set in PCSetUp_ASM()) or to MatCreateSubMatrix_MPIAIJ_All() when every process wants the whole matrix */ 1964 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1965 { 1966 PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2]; 1967 PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE; 1968 Mat_SeqAIJ *subc; 1969 Mat_SubSppt *smat; 1970 1971
PetscFunctionBegin; 1972 /* Check for special case: each processor has a single IS */ 1973 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1974 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 1975 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1976 PetscFunctionReturn(0); 1977 } 1978 1979 /* Collect global wantallmatrix and nstages */ 1980 if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt); 1981 else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt)); 1982 if (!nmax) nmax = 1; /* integer division yields 0 for very wide matrices; always allow at least one submatrix per stage */ 1983 1984 if (scall == MAT_INITIAL_MATRIX) { 1985 /* Collect global wantallmatrix and nstages */ 1986 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1987 PetscCall(ISIdentity(*isrow, &rowflag)); 1988 PetscCall(ISIdentity(*iscol, &colflag)); 1989 PetscCall(ISGetLocalSize(*isrow, &nrow)); 1990 PetscCall(ISGetLocalSize(*iscol, &ncol)); 1991 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1992 wantallmatrix = PETSC_TRUE; 1993 1994 PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL)); 1995 } 1996 } 1997 1998 /* Determine the number of stages through which submatrices are done 1999 Each stage will extract nmax submatrices. 2000 nmax is determined by the matrix column dimension. 2001 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2002 */ 2003 nstages = ismax / nmax + ((ismax % nmax) ?
1 : 0); /* local nstages */ 2004 2005 /* encode wantallmatrix as -1/0 so a single MPI_MAX reduction gives a logical AND across ranks in out[0] (out[0] == -1 only if every rank set -1) and the global maximum stage count in out[1] */ in[0] = -1 * (PetscInt)wantallmatrix; 2006 in[1] = nstages; 2007 PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C))); 2008 wantallmatrix = (PetscBool)(-out[0]); 2009 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2010 2011 } else { /* MAT_REUSE_MATRIX */ 2012 if (ismax) { 2013 subc = (Mat_SeqAIJ *)(*submat)[0]->data; 2014 smat = subc->submatis1; 2015 } else { /* (*submat)[0] is a dummy matrix */ 2016 smat = (Mat_SubSppt *)(*submat)[0]->data; 2017 } 2018 if (!smat) { 2019 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2020 wantallmatrix = PETSC_TRUE; 2021 } else if (smat->singleis) { 2022 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 2023 PetscFunctionReturn(0); 2024 } else { 2025 nstages = smat->nstages; 2026 } 2027 } 2028 2029 if (wantallmatrix) { 2030 PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat)); 2031 PetscFunctionReturn(0); 2032 } 2033 2034 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2035 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat)); 2036 2037 /* every rank runs the same global nstages; ranks whose local work is exhausted call the Local routine with max_no == 0 so the collective communication inside still matches up */ for (i = 0, pos = 0; i < nstages; i++) { 2038 if (pos + nmax <= ismax) max_no = nmax; 2039 else if (pos >= ismax) max_no = 0; 2040 else max_no = ismax - pos; 2041 2042 PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos)); 2043 if (!max_no) { 2044 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2045 smat = (Mat_SubSppt *)(*submat)[pos]->data; 2046 smat->nstages = nstages; 2047 } 2048 pos++; /* advance to next dummy matrix if any */ 2049 } else pos += max_no; 2050 } 2051 2052 if (ismax && scall == MAT_INITIAL_MATRIX) { 2053 /* save nstages for reuse */ 2054 subc = (Mat_SeqAIJ *)(*submat)[0]->data; 2055 smat = subc->submatis1; 2056 smat->nstages = nstages; 2057 } 2058
PetscFunctionReturn(0); 2059 } 2060 2061 /* -------------------------------------------------------------------------*/ 2062 /* Workhorse for one stage: performs the MPI exchange of requested rows/columns and assembles up to ismax sequential submatrices on each process */ PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) 2063 { 2064 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 2065 Mat A = c->A; 2066 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc; 2067 const PetscInt **icol, **irow; 2068 PetscInt *nrow, *ncol, start; 2069 PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr; 2070 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1; 2071 PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol; 2072 PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2073 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2074 #if defined(PETSC_USE_CTABLE) 2075 PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i; 2076 #else 2077 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2078 #endif 2079 const PetscInt *irow_i; 2080 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2081 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2082 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2083 MPI_Comm comm; 2084 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2085 PetscMPIInt *onodes1, *olengths1, end; 2086 PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2087 Mat_SubSppt *smat_i; 2088 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2089 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen; 2090 2091 PetscFunctionBegin; 2092 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2093 size = c->size; 2094 rank = c->rank; 2095 2096 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2097 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt
***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2098 2099 for (i = 0; i < ismax; i++) { 2100 PetscCall(ISSorted(iscol[i], &issorted[i])); 2101 if (!issorted[i]) iscsorted = issorted[i]; 2102 2103 PetscCall(ISSorted(isrow[i], &issorted[i])); 2104 2105 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2106 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2107 2108 /* Check for special case: allcolumn */ 2109 PetscCall(ISIdentity(iscol[i], &colflag)); 2110 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2111 if (colflag && ncol[i] == C->cmap->N) { 2112 allcolumns[i] = PETSC_TRUE; 2113 icol[i] = NULL; 2114 } else { 2115 allcolumns[i] = PETSC_FALSE; 2116 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2117 } 2118 } 2119 2120 if (scall == MAT_REUSE_MATRIX) { 2121 /* Assumes new rows are same length as the old rows */ 2122 for (i = 0; i < ismax; i++) { 2123 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2124 subc = (Mat_SeqAIJ *)submats[i]->data; 2125 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 2126 2127 /* Initial matrix as if empty */ 2128 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2129 2130 smat_i = subc->submatis1; 2131 2132 nrqs = smat_i->nrqs; 2133 nrqr = smat_i->nrqr; 2134 rbuf1 = smat_i->rbuf1; 2135 rbuf2 = smat_i->rbuf2; 2136 rbuf3 = smat_i->rbuf3; 2137 req_source2 = smat_i->req_source2; 2138 2139 sbuf1 = smat_i->sbuf1; 2140 sbuf2 = smat_i->sbuf2; 2141 ptr = smat_i->ptr; 2142 tmp = smat_i->tmp; 2143 ctr = smat_i->ctr; 2144 2145 pa = smat_i->pa; 2146 req_size = smat_i->req_size; 2147 req_source1 = smat_i->req_source1; 2148 2149 allcolumns[i] = smat_i->allcolumns; 2150 row2proc[i] = smat_i->row2proc; 2151 rmap[i] = smat_i->rmap; 2152 cmap[i] = smat_i->cmap; 2153 } 2154 2155 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2156 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2157 smat_i = (Mat_SubSppt *)submats[0]->data; 2158 2159 nrqs = smat_i->nrqs; 2160 nrqr = smat_i->nrqr; 2161 rbuf1 = smat_i->rbuf1; 2162 rbuf2 = smat_i->rbuf2; 2163 rbuf3 = smat_i->rbuf3; 2164 req_source2 = smat_i->req_source2; 2165 2166 sbuf1 = smat_i->sbuf1; 2167 sbuf2 = smat_i->sbuf2; 2168 ptr = smat_i->ptr; 2169 tmp = smat_i->tmp; 2170 ctr = smat_i->ctr; 2171 2172 pa = smat_i->pa; 2173 req_size = smat_i->req_size; 2174 req_source1 = smat_i->req_source1; 2175 2176 allcolumns[0] = PETSC_FALSE; 2177 } 2178 } else { /* scall == MAT_INITIAL_MATRIX */ 2179 /* Get some new tags to keep the communication clean */ 2180 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2181 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2182 2183 /* evaluate communication - mesg to who, length of mesg, and buffer space 2184 required. 
Based on this, buffers are allocated, and data copied into them*/ 2185 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2186 2187 for (i = 0; i < ismax; i++) { 2188 jmax = nrow[i]; 2189 irow_i = irow[i]; 2190 2191 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2192 row2proc[i] = row2proc_i; 2193 2194 if (issorted[i]) proc = 0; 2195 for (j = 0; j < jmax; j++) { 2196 if (!issorted[i]) proc = 0; 2197 row = irow_i[j]; 2198 while (row >= C->rmap->range[proc + 1]) proc++; 2199 w4[proc]++; 2200 row2proc_i[j] = proc; /* map row index to proc */ 2201 } 2202 for (j = 0; j < size; j++) { 2203 if (w4[j]) { 2204 w1[j] += w4[j]; 2205 w3[j]++; 2206 w4[j] = 0; 2207 } 2208 } 2209 } 2210 2211 nrqs = 0; /* no of outgoing messages */ 2212 msz = 0; /* total mesg length (for all procs) */ 2213 w1[rank] = 0; /* no mesg sent to self */ 2214 w3[rank] = 0; 2215 for (i = 0; i < size; i++) { 2216 if (w1[i]) { 2217 w2[i] = 1; 2218 nrqs++; 2219 } /* there exists a message to proc i */ 2220 } 2221 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2222 for (i = 0, j = 0; i < size; i++) { 2223 if (w1[i]) { 2224 pa[j] = i; 2225 j++; 2226 } 2227 } 2228 2229 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2230 for (i = 0; i < nrqs; i++) { 2231 j = pa[i]; 2232 w1[j] += w2[j] + 2 * w3[j]; 2233 msz += w1[j]; 2234 } 2235 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2236 2237 /* Determine the number of messages to expect, their lengths, from from-ids */ 2238 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 2239 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2240 2241 /* Now post the Irecvs corresponding to these messages */ 2242 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2243 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2244 2245 /* Allocate Memory 
for outgoing messages */ 2246 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2247 PetscCall(PetscArrayzero(sbuf1, size)); 2248 PetscCall(PetscArrayzero(ptr, size)); 2249 2250 { 2251 PetscInt *iptr = tmp; 2252 k = 0; 2253 for (i = 0; i < nrqs; i++) { 2254 j = pa[i]; 2255 iptr += k; 2256 sbuf1[j] = iptr; 2257 k = w1[j]; 2258 } 2259 } 2260 2261 /* Form the outgoing messages. Initialize the header space */ 2262 for (i = 0; i < nrqs; i++) { 2263 j = pa[i]; 2264 sbuf1[j][0] = 0; 2265 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2266 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2267 } 2268 2269 /* Parse the isrow and copy data into outbuf */ 2270 for (i = 0; i < ismax; i++) { 2271 row2proc_i = row2proc[i]; 2272 PetscCall(PetscArrayzero(ctr, size)); 2273 irow_i = irow[i]; 2274 jmax = nrow[i]; 2275 for (j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2276 proc = row2proc_i[j]; 2277 if (proc != rank) { /* copy to the outgoing buf*/ 2278 ctr[proc]++; 2279 *ptr[proc] = irow_i[j]; 2280 ptr[proc]++; 2281 } 2282 } 2283 /* Update the headers for the current IS */ 2284 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 2285 if ((ctr_j = ctr[j])) { 2286 sbuf1_j = sbuf1[j]; 2287 k = ++sbuf1_j[0]; 2288 sbuf1_j[2 * k] = ctr_j; 2289 sbuf1_j[2 * k - 1] = i; 2290 } 2291 } 2292 } 2293 2294 /* Now post the sends */ 2295 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2296 for (i = 0; i < nrqs; ++i) { 2297 j = pa[i]; 2298 PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2299 } 2300 2301 /* Post Receives to capture the buffer size */ 2302 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2303 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2304 if (nrqs) rbuf2[0] = tmp + msz; 2305 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2306 for (i = 0; i < nrqs; ++i) { 2307 j = pa[i]; 2308 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2309 } 2310 2311 /* 
Send to other procs the buf size they should allocate */ 2312 /* Receive messages*/ 2313 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2314 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2315 { 2316 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2317 PetscInt *sbuf2_i; 2318 2319 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2320 for (i = 0; i < nrqr; ++i) { 2321 req_size[i] = 0; 2322 rbuf1_i = rbuf1[i]; 2323 start = 2 * rbuf1_i[0] + 1; 2324 end = olengths1[i]; 2325 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2326 sbuf2_i = sbuf2[i]; 2327 for (j = start; j < end; j++) { 2328 id = rbuf1_i[j] - rstart; 2329 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2330 sbuf2_i[j] = ncols; 2331 req_size[i] += ncols; 2332 } 2333 req_source1[i] = onodes1[i]; 2334 /* form the header */ 2335 sbuf2_i[0] = req_size[i]; 2336 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2337 2338 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2339 } 2340 } 2341 2342 PetscCall(PetscFree(onodes1)); 2343 PetscCall(PetscFree(olengths1)); 2344 PetscCall(PetscFree(r_waits1)); 2345 PetscCall(PetscFree4(w1, w2, w3, w4)); 2346 2347 /* Receive messages*/ 2348 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2349 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2350 for (i = 0; i < nrqs; ++i) { 2351 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2352 req_source2[i] = pa[i]; 2353 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2354 } 2355 PetscCall(PetscFree(r_waits2)); 2356 2357 /* Wait on sends1 and sends2 */ 2358 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2359 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2360 PetscCall(PetscFree(s_waits1)); 2361 PetscCall(PetscFree(s_waits2)); 2362 2363 /* Now allocate sending buffers for a->j, and send them off */ 2364 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 
2365 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2366 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 2367 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2368 2369 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2370 { 2371 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2372 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2373 PetscInt cend = C->cmap->rend; 2374 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2375 2376 for (i = 0; i < nrqr; i++) { 2377 rbuf1_i = rbuf1[i]; 2378 sbuf_aj_i = sbuf_aj[i]; 2379 ct1 = 2 * rbuf1_i[0] + 1; 2380 ct2 = 0; 2381 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2382 kmax = rbuf1[i][2 * j]; 2383 for (k = 0; k < kmax; k++, ct1++) { 2384 row = rbuf1_i[ct1] - rstart; 2385 nzA = a_i[row + 1] - a_i[row]; 2386 nzB = b_i[row + 1] - b_i[row]; 2387 ncols = nzA + nzB; 2388 cworkA = a_j + a_i[row]; 2389 cworkB = b_j + b_i[row]; 2390 2391 /* load the column indices for this row into cols */ 2392 cols = sbuf_aj_i + ct2; 2393 2394 lwrite = 0; 2395 for (l = 0; l < nzB; l++) { 2396 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2397 } 2398 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2399 for (l = 0; l < nzB; l++) { 2400 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2401 } 2402 2403 ct2 += ncols; 2404 } 2405 } 2406 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2407 } 2408 } 2409 2410 /* create col map: global col of C -> local col of submatrices */ 2411 { 2412 const PetscInt *icol_i; 2413 #if defined(PETSC_USE_CTABLE) 2414 for (i = 0; i < ismax; i++) { 2415 if (!allcolumns[i]) { 2416 PetscCall(PetscTableCreate(ncol[i], C->cmap->N, &cmap[i])); 2417 2418 jmax = ncol[i]; 2419 icol_i = icol[i]; 2420 cmap_i = cmap[i]; 2421 for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES)); 2422 } else cmap[i] = NULL; 2423 } 2424 #else 2425 
for (i = 0; i < ismax; i++) { 2426 if (!allcolumns[i]) { 2427 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2428 jmax = ncol[i]; 2429 icol_i = icol[i]; 2430 cmap_i = cmap[i]; 2431 for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2432 } else cmap[i] = NULL; 2433 } 2434 #endif 2435 } 2436 2437 /* Create lens which is required for MatCreate... */ 2438 for (i = 0, j = 0; i < ismax; i++) j += nrow[i]; 2439 PetscCall(PetscMalloc1(ismax, &lens)); 2440 2441 if (ismax) PetscCall(PetscCalloc1(j, &lens[0])); 2442 for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1]; 2443 2444 /* Update lens from local data */ 2445 for (i = 0; i < ismax; i++) { 2446 row2proc_i = row2proc[i]; 2447 jmax = nrow[i]; 2448 if (!allcolumns[i]) cmap_i = cmap[i]; 2449 irow_i = irow[i]; 2450 lens_i = lens[i]; 2451 for (j = 0; j < jmax; j++) { 2452 row = irow_i[j]; 2453 proc = row2proc_i[j]; 2454 if (proc == rank) { 2455 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2456 if (!allcolumns[i]) { 2457 for (k = 0; k < ncols; k++) { 2458 #if defined(PETSC_USE_CTABLE) 2459 PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol)); 2460 #else 2461 tcol = cmap_i[cols[k]]; 2462 #endif 2463 if (tcol) lens_i[j]++; 2464 } 2465 } else { /* allcolumns */ 2466 lens_i[j] = ncols; 2467 } 2468 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2469 } 2470 } 2471 } 2472 2473 /* Create row map: global row of C -> local row of submatrices */ 2474 #if defined(PETSC_USE_CTABLE) 2475 for (i = 0; i < ismax; i++) { 2476 PetscCall(PetscTableCreate(nrow[i], C->rmap->N, &rmap[i])); 2477 irow_i = irow[i]; 2478 jmax = nrow[i]; 2479 for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES)); 2480 } 2481 #else 2482 for (i = 0; i < ismax; i++) { 2483 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2484 rmap_i = rmap[i]; 2485 irow_i = irow[i]; 2486 jmax = nrow[i]; 2487 for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2488 } 2489 #endif 2490 2491 /* Update lens from 
offproc data */ 2492 { 2493 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2494 2495 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2496 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2497 sbuf1_i = sbuf1[pa[tmp2]]; 2498 jmax = sbuf1_i[0]; 2499 ct1 = 2 * jmax + 1; 2500 ct2 = 0; 2501 rbuf2_i = rbuf2[tmp2]; 2502 rbuf3_i = rbuf3[tmp2]; 2503 for (j = 1; j <= jmax; j++) { 2504 is_no = sbuf1_i[2 * j - 1]; 2505 max1 = sbuf1_i[2 * j]; 2506 lens_i = lens[is_no]; 2507 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2508 rmap_i = rmap[is_no]; 2509 for (k = 0; k < max1; k++, ct1++) { 2510 #if defined(PETSC_USE_CTABLE) 2511 PetscCall(PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row)); 2512 row--; 2513 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2514 #else 2515 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2516 #endif 2517 max2 = rbuf2_i[ct1]; 2518 for (l = 0; l < max2; l++, ct2++) { 2519 if (!allcolumns[is_no]) { 2520 #if defined(PETSC_USE_CTABLE) 2521 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol)); 2522 #else 2523 tcol = cmap_i[rbuf3_i[ct2]]; 2524 #endif 2525 if (tcol) lens_i[row]++; 2526 } else { /* allcolumns */ 2527 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2528 } 2529 } 2530 } 2531 } 2532 } 2533 } 2534 PetscCall(PetscFree(r_waits3)); 2535 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2536 PetscCall(PetscFree(s_waits3)); 2537 2538 /* Create the submatrices */ 2539 for (i = 0; i < ismax; i++) { 2540 PetscInt rbs, cbs; 2541 2542 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2543 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2544 2545 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2546 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2547 2548 PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2549 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2550 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2551 2552 /* create struct Mat_SubSppt and attached it to submat */ 2553 PetscCall(PetscNew(&smat_i)); 2554 subc = (Mat_SeqAIJ *)submats[i]->data; 2555 subc->submatis1 = smat_i; 2556 2557 smat_i->destroy = submats[i]->ops->destroy; 2558 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2559 submats[i]->factortype = C->factortype; 2560 2561 smat_i->id = i; 2562 smat_i->nrqs = nrqs; 2563 smat_i->nrqr = nrqr; 2564 smat_i->rbuf1 = rbuf1; 2565 smat_i->rbuf2 = rbuf2; 2566 smat_i->rbuf3 = rbuf3; 2567 smat_i->sbuf2 = sbuf2; 2568 smat_i->req_source2 = req_source2; 2569 2570 smat_i->sbuf1 = sbuf1; 2571 smat_i->ptr = ptr; 2572 smat_i->tmp = tmp; 2573 smat_i->ctr = ctr; 2574 2575 smat_i->pa = pa; 2576 smat_i->req_size = req_size; 2577 smat_i->req_source1 = req_source1; 2578 2579 smat_i->allcolumns = allcolumns[i]; 2580 smat_i->singleis = PETSC_FALSE; 2581 smat_i->row2proc = row2proc[i]; 2582 smat_i->rmap = rmap[i]; 2583 smat_i->cmap = cmap[i]; 2584 } 2585 2586 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2587 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2588 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2589 PetscCall(MatSetType(submats[0], MATDUMMY)); 2590 2591 /* create struct Mat_SubSppt 
and attached it to submat */ 2592 PetscCall(PetscNew(&smat_i)); 2593 submats[0]->data = (void *)smat_i; 2594 2595 smat_i->destroy = submats[0]->ops->destroy; 2596 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2597 submats[0]->factortype = C->factortype; 2598 2599 smat_i->id = 0; 2600 smat_i->nrqs = nrqs; 2601 smat_i->nrqr = nrqr; 2602 smat_i->rbuf1 = rbuf1; 2603 smat_i->rbuf2 = rbuf2; 2604 smat_i->rbuf3 = rbuf3; 2605 smat_i->sbuf2 = sbuf2; 2606 smat_i->req_source2 = req_source2; 2607 2608 smat_i->sbuf1 = sbuf1; 2609 smat_i->ptr = ptr; 2610 smat_i->tmp = tmp; 2611 smat_i->ctr = ctr; 2612 2613 smat_i->pa = pa; 2614 smat_i->req_size = req_size; 2615 smat_i->req_source1 = req_source1; 2616 2617 smat_i->allcolumns = PETSC_FALSE; 2618 smat_i->singleis = PETSC_FALSE; 2619 smat_i->row2proc = NULL; 2620 smat_i->rmap = NULL; 2621 smat_i->cmap = NULL; 2622 } 2623 2624 if (ismax) PetscCall(PetscFree(lens[0])); 2625 PetscCall(PetscFree(lens)); 2626 if (sbuf_aj) { 2627 PetscCall(PetscFree(sbuf_aj[0])); 2628 PetscCall(PetscFree(sbuf_aj)); 2629 } 2630 2631 } /* endof scall == MAT_INITIAL_MATRIX */ 2632 2633 /* Post recv matrix values */ 2634 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2635 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2636 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2637 for (i = 0; i < nrqs; ++i) { 2638 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2639 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2640 } 2641 2642 /* Allocate sending buffers for a->a, and send them off */ 2643 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2644 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2645 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 2646 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2647 2648 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2649 { 2650 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite; 2651 PetscInt cstart = C->cmap->rstart, rstart = 
C->rmap->rstart, *bmap = c->garray; 2652 PetscInt cend = C->cmap->rend; 2653 PetscInt *b_j = b->j; 2654 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2655 2656 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2657 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2658 for (i = 0; i < nrqr; i++) { 2659 rbuf1_i = rbuf1[i]; 2660 sbuf_aa_i = sbuf_aa[i]; 2661 ct1 = 2 * rbuf1_i[0] + 1; 2662 ct2 = 0; 2663 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2664 kmax = rbuf1_i[2 * j]; 2665 for (k = 0; k < kmax; k++, ct1++) { 2666 row = rbuf1_i[ct1] - rstart; 2667 nzA = a_i[row + 1] - a_i[row]; 2668 nzB = b_i[row + 1] - b_i[row]; 2669 ncols = nzA + nzB; 2670 cworkB = b_j + b_i[row]; 2671 vworkA = a_a + a_i[row]; 2672 vworkB = b_a + b_i[row]; 2673 2674 /* load the column values for this row into vals*/ 2675 vals = sbuf_aa_i + ct2; 2676 2677 lwrite = 0; 2678 for (l = 0; l < nzB; l++) { 2679 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2680 } 2681 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2682 for (l = 0; l < nzB; l++) { 2683 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2684 } 2685 2686 ct2 += ncols; 2687 } 2688 } 2689 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2690 } 2691 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2692 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2693 } 2694 2695 /* Assemble the matrices */ 2696 /* First assemble the local rows */ 2697 for (i = 0; i < ismax; i++) { 2698 row2proc_i = row2proc[i]; 2699 subc = (Mat_SeqAIJ *)submats[i]->data; 2700 imat_ilen = subc->ilen; 2701 imat_j = subc->j; 2702 imat_i = subc->i; 2703 imat_a = subc->a; 2704 2705 if (!allcolumns[i]) cmap_i = cmap[i]; 2706 rmap_i = rmap[i]; 2707 irow_i = irow[i]; 2708 jmax = nrow[i]; 2709 for (j = 0; j < jmax; j++) { 2710 row = irow_i[j]; 2711 proc = row2proc_i[j]; 2712 if (proc == rank) { 2713 old_row = row; 2714 
#if defined(PETSC_USE_CTABLE) 2715 PetscCall(PetscTableFind(rmap_i, row + 1, &row)); 2716 row--; 2717 #else 2718 row = rmap_i[row]; 2719 #endif 2720 ilen_row = imat_ilen[row]; 2721 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2722 mat_i = imat_i[row]; 2723 mat_a = imat_a + mat_i; 2724 mat_j = imat_j + mat_i; 2725 if (!allcolumns[i]) { 2726 for (k = 0; k < ncols; k++) { 2727 #if defined(PETSC_USE_CTABLE) 2728 PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol)); 2729 #else 2730 tcol = cmap_i[cols[k]]; 2731 #endif 2732 if (tcol) { 2733 *mat_j++ = tcol - 1; 2734 *mat_a++ = vals[k]; 2735 ilen_row++; 2736 } 2737 } 2738 } else { /* allcolumns */ 2739 for (k = 0; k < ncols; k++) { 2740 *mat_j++ = cols[k]; /* global col index! */ 2741 *mat_a++ = vals[k]; 2742 ilen_row++; 2743 } 2744 } 2745 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2746 2747 imat_ilen[row] = ilen_row; 2748 } 2749 } 2750 } 2751 2752 /* Now assemble the off proc rows */ 2753 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2754 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2755 sbuf1_i = sbuf1[pa[tmp2]]; 2756 jmax = sbuf1_i[0]; 2757 ct1 = 2 * jmax + 1; 2758 ct2 = 0; 2759 rbuf2_i = rbuf2[tmp2]; 2760 rbuf3_i = rbuf3[tmp2]; 2761 rbuf4_i = rbuf4[tmp2]; 2762 for (j = 1; j <= jmax; j++) { 2763 is_no = sbuf1_i[2 * j - 1]; 2764 rmap_i = rmap[is_no]; 2765 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2766 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2767 imat_ilen = subc->ilen; 2768 imat_j = subc->j; 2769 imat_i = subc->i; 2770 imat_a = subc->a; 2771 max1 = sbuf1_i[2 * j]; 2772 for (k = 0; k < max1; k++, ct1++) { 2773 row = sbuf1_i[ct1]; 2774 #if defined(PETSC_USE_CTABLE) 2775 PetscCall(PetscTableFind(rmap_i, row + 1, &row)); 2776 row--; 2777 #else 2778 row = rmap_i[row]; 2779 #endif 2780 ilen = imat_ilen[row]; 2781 mat_i = imat_i[row]; 2782 mat_a = imat_a + mat_i; 2783 mat_j = imat_j + mat_i; 2784 max2 = rbuf2_i[ct1]; 2785 if (!allcolumns[is_no]) { 2786 for (l = 0; l < 
max2; l++, ct2++) { 2787 #if defined(PETSC_USE_CTABLE) 2788 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol)); 2789 #else 2790 tcol = cmap_i[rbuf3_i[ct2]]; 2791 #endif 2792 if (tcol) { 2793 *mat_j++ = tcol - 1; 2794 *mat_a++ = rbuf4_i[ct2]; 2795 ilen++; 2796 } 2797 } 2798 } else { /* allcolumns */ 2799 for (l = 0; l < max2; l++, ct2++) { 2800 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2801 *mat_a++ = rbuf4_i[ct2]; 2802 ilen++; 2803 } 2804 } 2805 imat_ilen[row] = ilen; 2806 } 2807 } 2808 } 2809 2810 if (!iscsorted) { /* sort column indices of the rows */ 2811 for (i = 0; i < ismax; i++) { 2812 subc = (Mat_SeqAIJ *)submats[i]->data; 2813 imat_j = subc->j; 2814 imat_i = subc->i; 2815 imat_a = subc->a; 2816 imat_ilen = subc->ilen; 2817 2818 if (allcolumns[i]) continue; 2819 jmax = nrow[i]; 2820 for (j = 0; j < jmax; j++) { 2821 mat_i = imat_i[j]; 2822 mat_a = imat_a + mat_i; 2823 mat_j = imat_j + mat_i; 2824 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2825 } 2826 } 2827 } 2828 2829 PetscCall(PetscFree(r_waits4)); 2830 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2831 PetscCall(PetscFree(s_waits4)); 2832 2833 /* Restore the indices */ 2834 for (i = 0; i < ismax; i++) { 2835 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2836 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2837 } 2838 2839 for (i = 0; i < ismax; i++) { 2840 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2841 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2842 } 2843 2844 /* Destroy allocated memory */ 2845 if (sbuf_aa) { 2846 PetscCall(PetscFree(sbuf_aa[0])); 2847 PetscCall(PetscFree(sbuf_aa)); 2848 } 2849 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2850 2851 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2852 PetscCall(PetscFree(rbuf4)); 2853 2854 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2855 
PetscFunctionReturn(0); 2856 } 2857 2858 /* 2859 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2860 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2861 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2862 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2863 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2864 state, and needs to be "assembled" later by compressing B's column space. 2865 2866 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2867 Following this call, C->A & C->B have been created, even if empty. 2868 */ 2869 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2870 { 2871 /* If making this function public, change the error returned in this function away from _PLIB. */ 2872 Mat_MPIAIJ *aij; 2873 Mat_SeqAIJ *Baij; 2874 PetscBool seqaij, Bdisassembled; 2875 PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count; 2876 PetscScalar v; 2877 const PetscInt *rowindices, *colindices; 2878 2879 PetscFunctionBegin; 2880 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
*/ 2881 if (A) { 2882 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2883 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2884 if (rowemb) { 2885 PetscCall(ISGetLocalSize(rowemb, &m)); 2886 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2887 } else { 2888 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2889 } 2890 if (dcolemb) { 2891 PetscCall(ISGetLocalSize(dcolemb, &n)); 2892 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2893 } else { 2894 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2895 } 2896 } 2897 if (B) { 2898 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2899 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2900 if (rowemb) { 2901 PetscCall(ISGetLocalSize(rowemb, &m)); 2902 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2903 } else { 2904 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2905 } 2906 if (ocolemb) { 2907 PetscCall(ISGetLocalSize(ocolemb, &n)); 2908 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2909 } else { 2910 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with 
the MPIAIJ matrix"); 2911 } 2912 } 2913 2914 aij = (Mat_MPIAIJ *)C->data; 2915 if (!aij->A) { 2916 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2917 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2918 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2919 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2920 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2921 } 2922 if (A) { 2923 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2924 } else { 2925 PetscCall(MatSetUp(aij->A)); 2926 } 2927 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2928 /* 2929 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2930 need to "disassemble" B -- convert it to using C's global indices. 2931 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2932 2933 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2934 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2935 2936 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2937 At least avoid calling MatSetValues() and the implied searches? 2938 */ 2939 2940 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2941 #if defined(PETSC_USE_CTABLE) 2942 PetscCall(PetscTableDestroy(&aij->colmap)); 2943 #else 2944 PetscCall(PetscFree(aij->colmap)); 2945 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. 
*/ 2946 #endif 2947 ngcol = 0; 2948 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2949 if (aij->garray) { PetscCall(PetscFree(aij->garray)); } 2950 PetscCall(VecDestroy(&aij->lvec)); 2951 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2952 } 2953 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2954 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2955 } 2956 Bdisassembled = PETSC_FALSE; 2957 if (!aij->B) { 2958 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2959 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2960 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2961 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2962 Bdisassembled = PETSC_TRUE; 2963 } 2964 if (B) { 2965 Baij = (Mat_SeqAIJ *)B->data; 2966 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2967 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2968 for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2969 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2970 PetscCall(PetscFree(nz)); 2971 } 2972 2973 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2974 shift = rend - rstart; 2975 count = 0; 2976 rowindices = NULL; 2977 colindices = NULL; 2978 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2979 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2980 for (i = 0; i < B->rmap->n; i++) { 2981 PetscInt row; 2982 row = i; 2983 if (rowindices) row = rowindices[i]; 2984 for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2985 col = Baij->j[count]; 2986 if (colindices) col = colindices[col]; 2987 if (Bdisassembled && col >= rstart) col += shift; 2988 v = Baij->a[count]; 2989 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2990 ++count; 2991 } 2992 } 2993 /* No assembly for aij->B is necessary. */ 2994 /* FIXME: set aij->B's nonzerostate correctly. 
*/ 2995 } else { 2996 PetscCall(MatSetUp(aij->B)); 2997 } 2998 C->preallocated = PETSC_TRUE; 2999 C->was_assembled = PETSC_FALSE; 3000 C->assembled = PETSC_FALSE; 3001 /* 3002 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3003 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3004 */ 3005 PetscFunctionReturn(0); 3006 } 3007 3008 /* 3009 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3010 */ 3011 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 3012 { 3013 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 3014 3015 PetscFunctionBegin; 3016 PetscValidPointer(A, 2); 3017 PetscValidPointer(B, 3); 3018 /* FIXME: make sure C is assembled */ 3019 *A = aij->A; 3020 *B = aij->B; 3021 /* Note that we don't incref *A and *B, so be careful! */ 3022 PetscFunctionReturn(0); 3023 } 3024 3025 /* 3026 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3027 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
 */
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    i, ii, cismax, ispar;
  Mat        *A, *B;
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  /* NOTE(review): getlocalmats is not called anywhere in this routine's body; presumably kept for
     interface symmetry with other callers -- confirm before removing from the signature. */
  if (!ismax) PetscFunctionReturn(0);

  /* Count this rank's parallel (comm size > 1) index sets; only those need an off-diag extraction. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        j, issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction; see the comment above. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        /* Drop our reference to the diag piece here; the off-diag pieces B[ii] are destroyed
           collectively by MatDestroySubMatrices() below. */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller's submat array */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  /* Every A[i] was either destroyed or transferred above, so only the array itself is freed here. */
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(0);
}

/* MPIAIJ instantiation of the generic driver above. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}