1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov) { 22 PetscInt i; 23 24 PetscFunctionBegin; 25 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 26 for (i = 0; i < ov; ++i) { PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is)); } 27 PetscFunctionReturn(0); 28 } 29 30 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov) { 31 PetscInt i; 32 33 PetscFunctionBegin; 34 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 35 for (i = 0; i < ov; ++i) { PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is)); } 36 PetscFunctionReturn(0); 37 } 38 39 
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - add one layer of overlap to each IS.

   Rows of is[] owned by other ranks are collected, shipped to their owners with
   a two-sided handshake + PetscSF broadcast, expanded there (Send_Scalable),
   and the expanded index data is shipped back and merged (Receive_Scalable).
   Locally-owned rows are expanded in place by Local_Scalable.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[]) {
  MPI_Comm comm;
  PetscInt *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout rmap;
  PetscMPIInt rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF sf;
  PetscSFNode *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows] = owner;          /* processor */
        rrow_isids[nrrows] = i;              /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size: 2 entries (is id, row) per remote row */
      tosizes_temp[i] = nto;                  /* a map from processor to index */
      toranks[nto++] = i;                     /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1] = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];               /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner = rrow_ranks[i];   /* processor */
    j = tosizes_temp[owner]; /* index (tosizes_temp was repurposed above as proc->slot map) */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second handshake: reply sizes flow in the reverse direction */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: expand local rows while replies are in flight */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge received overlap data into is[].

   recvdata is a stream of <is id, count, count rows...> records of total length
   nrecvs. Each is[i] is destroyed and recreated with its old indices plus the
   received ones, sorted with duplicates removed.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) {
  PetscInt *isz, isz_i, i, j, is_id, data_size;
  PetscInt col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i] = lsize;
  }
  /* one (max_lsize + nrecvs)-sized stripe per IS: worst case every received entry
     lands in the same IS, so each stripe can hold old indices plus all new ones */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    /* keep the IS communicator alive across the destroy/recreate below */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - expand rows requested by other ranks.

   fromrows holds <is id, row> pairs received from each of the nfrom ranks
   (fromsizes[2*i] entries from rank fromranks[i]). For each requested locally
   owned row, collect its global column indices from the diagonal (A) and
   off-diagonal (B) blocks. Outputs (caller frees):
     sbrowsizes - per-rank <size, offset> pairs of the reply buffer
     sbrows     - reply stream of <is id, count, columns...> triples
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) {
  PetscLayout rmap, cmap;
  PetscInt i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat amat, bmat;
  PetscMPIInt rank;
  PetscBool done;
  MPI_Comm comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos = 0;
  totalrows = 0;
  /* first pass: only count the (deduplicated) columns per <rank, is> to size the reply */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row = rows_i[l] - rstart;
        start = ai[row];
        end = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos = 0;
  /* use this code again: second pass fills the reply buffer with the same grouping */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row = rows_i[l] - rstart;
        start = ai[row];
        end = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1] = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each is[i] by the columns of
   its locally owned rows (rows owned by other ranks are skipped; they are handled
   by the send/receive path). Each IS is recreated sorted and duplicate-free.
   NOTE(review): the new IS contains only the expansion of locally owned rows; the
   original indices are re-added through the received data path — confirm against
   the callers in _Once_Scalable.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[]) {
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt lsize, lsize_tmp;
  PetscMPIInt rank, owner;
  Mat amat, bmat;
  PetscBool done;
  PetscLayout cmap, rmap;
  MPI_Comm comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat: shift local column to global */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat: map through garray to global */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* keep the communicator alive while the IS is destroyed and recreated */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
   Sample message format:
   If a processor A wants
   processor B to process some elements corresponding
   to index sets is[1],is[5]
   mesg [0] = 2   (no of index sets in the mesg)
   -----------
   mesg [1] = 1 => is[1]
   mesg [2] = sizeof(is[1]);
   -----------
   mesg [3] = 5  => is[5]
   mesg [4] = sizeof(is[5]);
   -----------
   mesg [5]
   mesg [n]  datas[1]
   -----------
   mesg[n+1]
   mesg[m]  data(is[5])
   -----------

   Notes:
   nrqs - no of requests sent (or to be sent out)
   nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[]) {
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscTable *table_data, table_data_i;
  PetscInt *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt size, rank, tag1, tag2, proc = 0;
  PetscInt M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT *table;
  MPI_Comm comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status *recv_status;
  MPI_Comm *iscomms;
  char *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M = C->rmap->N;

  /* two tags: one for the request messages, one for the replies */
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  /* w1: data entries per destination, w2: 0/1 message flag, w3: number of ISs
     contributing per destination, w4: scratch per-IS counts */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
    idx_i = idx[i];
    len = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j];
        w3[j]++;
      }
    }
  }

  nrqs = 0;     /* no of outgoing messages */
  msz = 0;      /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    /* carve the single tmp buffer into per-destination slices */
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) { PetscCall(PetscTableCreate(n[i], M, &table_data[i])); }
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) { table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i; }
#else
    /* without ctable: one M-sized dense array and one M-bit table per IS */
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i] = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i = n[i];
      table_i = table[i];
      idx_i = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j = outdat[j];
          k = ++outdat_j[0];
          outdat_j[2 * k] = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    /* keep each IS communicator alive across destroy/recreate */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) { PetscCall(ISDestroy(&is[i])); }

  /* Do Local work while the requests are in flight */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  /* process the received requests into reply buffers xdata/isz1 */
  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT table_i;

    for (i = 0; i < nrqs; ++i) {
      /* take replies in whatever order they complete */
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1 = 2 * rbuf2_i[0] + 1; /* data starts after the header */
      jmax = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max = rbuf2_i[2 * j];
        is_no = rbuf2_i[2 * j - 1];
        isz_i = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  /* size the scratch array by the largest table */
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  /* rebuild each IS from the accumulated table/data */
  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &k, &j));
      tdata[--j] = --k; /* table stores key+1 -> position+1 */
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) { PetscCall(PetscTableDestroy((PetscTable *)&table_data[i])); }
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
    C     - MAT_MPIAIJ;
    imax  - total no of index sets processed at a time;
    table - an array of char - size = m bits.

   Output:
    isz   - array containing the count of the solution elements corresponding
            to each index set;
    data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data) {
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable table_data_i;
  PetscTablePosition tpos;
  PetscInt tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai = a->i;
  aj = a->j;
  bi = b->i;
  bj = b->j;
  garray = c->garray;

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &row, &j));
      tdata[--j] = --row; /* table stores key+1 -> position+1 */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i = isz[i];
    /* only the rows present before this call are expanded; entries appended
       inside the loop below are not themselves expanded (one overlap layer) */
    max = isz[i];

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat: local column -> global */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat: map through garray to global */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather then all previous rows as it is now where a single large chunk of
   memory is used.
901 902 */ 903 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1) { 904 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 905 Mat A = c->A, B = c->B; 906 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data; 907 PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k; 908 PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end; 909 PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr; 910 PetscInt *rbuf_i, kmax, rbuf_0; 911 PetscBT xtable; 912 913 PetscFunctionBegin; 914 m = C->rmap->N; 915 rstart = C->rmap->rstart; 916 cstart = C->cmap->rstart; 917 ai = a->i; 918 aj = a->j; 919 bi = b->i; 920 bj = b->j; 921 garray = c->garray; 922 923 for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) { 924 rbuf_i = rbuf[i]; 925 rbuf_0 = rbuf_i[0]; 926 ct += rbuf_0; 927 for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j]; 928 } 929 930 if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n; 931 else max1 = 1; 932 mem_estimate = 3 * ((total_sz > max1 ? 
total_sz : max1) + 1); 933 if (nrqr) { 934 PetscCall(PetscMalloc1(mem_estimate, &xdata[0])); 935 ++no_malloc; 936 } 937 PetscCall(PetscBTCreate(m, &xtable)); 938 PetscCall(PetscArrayzero(isz1, nrqr)); 939 940 ct3 = 0; 941 for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */ 942 rbuf_i = rbuf[i]; 943 rbuf_0 = rbuf_i[0]; 944 ct1 = 2 * rbuf_0 + 1; 945 ct2 = ct1; 946 ct3 += ct1; 947 for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/ 948 PetscCall(PetscBTMemzero(m, xtable)); 949 oct2 = ct2; 950 kmax = rbuf_i[2 * j]; 951 for (k = 0; k < kmax; k++, ct1++) { 952 row = rbuf_i[ct1]; 953 if (!PetscBTLookupSet(xtable, row)) { 954 if (!(ct3 < mem_estimate)) { 955 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 956 PetscCall(PetscMalloc1(new_estimate, &tmp)); 957 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 958 PetscCall(PetscFree(xdata[0])); 959 xdata[0] = tmp; 960 mem_estimate = new_estimate; 961 ++no_malloc; 962 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 963 } 964 xdata[i][ct2++] = row; 965 ct3++; 966 } 967 } 968 for (k = oct2, max2 = ct2; k < max2; k++) { 969 row = xdata[i][k] - rstart; 970 start = ai[row]; 971 end = ai[row + 1]; 972 for (l = start; l < end; l++) { 973 val = aj[l] + cstart; 974 if (!PetscBTLookupSet(xtable, val)) { 975 if (!(ct3 < mem_estimate)) { 976 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 977 PetscCall(PetscMalloc1(new_estimate, &tmp)); 978 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 979 PetscCall(PetscFree(xdata[0])); 980 xdata[0] = tmp; 981 mem_estimate = new_estimate; 982 ++no_malloc; 983 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 984 } 985 xdata[i][ct2++] = val; 986 ct3++; 987 } 988 } 989 start = bi[row]; 990 end = bi[row + 1]; 991 for (l = start; l < end; l++) { 992 val = garray[bj[l]]; 993 if (!PetscBTLookupSet(xtable, val)) { 994 if (!(ct3 < mem_estimate)) { 995 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 996 
PetscCall(PetscMalloc1(new_estimate, &tmp)); 997 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 998 PetscCall(PetscFree(xdata[0])); 999 xdata[0] = tmp; 1000 mem_estimate = new_estimate; 1001 ++no_malloc; 1002 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 1003 } 1004 xdata[i][ct2++] = val; 1005 ct3++; 1006 } 1007 } 1008 } 1009 /* Update the header*/ 1010 xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1011 xdata[i][2 * j - 1] = rbuf_i[2 * j - 1]; 1012 } 1013 xdata[i][0] = rbuf_0; 1014 if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; 1015 isz1[i] = ct2; /* size of each message */ 1016 } 1017 PetscCall(PetscBTDestroy(&xtable)); 1018 PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc)); 1019 PetscFunctionReturn(0); 1020 } 1021 /* -------------------------------------------------------------------------*/ 1022 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *); 1023 /* 1024 Every processor gets the entire matrix 1025 */ 1026 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[]) { 1027 Mat B; 1028 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 1029 Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data; 1030 PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL; 1031 PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j; 1032 PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf; 1033 1034 PetscFunctionBegin; 1035 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1036 PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank)); 1037 if (scall == MAT_INITIAL_MATRIX) { 1038 /* ---------------------------------------------------------------- 1039 Tell every processor the number of nonzeros per row 1040 
*/ 1041 PetscCall(PetscMalloc1(A->rmap->N, &lens)); 1042 for (i = A->rmap->rstart; i < A->rmap->rend; i++) { lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart]; } 1043 PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); 1044 for (i = 0; i < size; i++) { 1045 recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i]; 1046 displs[i] = A->rmap->range[i]; 1047 } 1048 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A))); 1049 /* --------------------------------------------------------------- 1050 Create the sequential matrix of the same type as the local block diagonal 1051 */ 1052 PetscCall(MatCreate(PETSC_COMM_SELF, &B)); 1053 PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE)); 1054 PetscCall(MatSetBlockSizesFromMats(B, A, A)); 1055 PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name)); 1056 PetscCall(MatSeqAIJSetPreallocation(B, 0, lens)); 1057 PetscCall(PetscCalloc1(2, Bin)); 1058 **Bin = B; 1059 b = (Mat_SeqAIJ *)B->data; 1060 1061 /*-------------------------------------------------------------------- 1062 Copy my part of matrix column indices over 1063 */ 1064 sendcount = ad->nz + bd->nz; 1065 jsendbuf = b->j + b->i[rstarts[rank]]; 1066 a_jsendbuf = ad->j; 1067 b_jsendbuf = bd->j; 1068 n = A->rmap->rend - A->rmap->rstart; 1069 cnt = 0; 1070 for (i = 0; i < n; i++) { 1071 /* put in lower diagonal portion */ 1072 m = bd->i[i + 1] - bd->i[i]; 1073 while (m > 0) { 1074 /* is it above diagonal (in bd (compressed) numbering) */ 1075 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1076 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1077 m--; 1078 } 1079 1080 /* put in diagonal portion */ 1081 for (j = ad->i[i]; j < ad->i[i + 1]; j++) { jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; } 1082 1083 /* put in upper diagonal portion */ 1084 while (m-- > 0) { 
jsendbuf[cnt++] = garray[*b_jsendbuf++]; } 1085 } 1086 PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt); 1087 1088 /*-------------------------------------------------------------------- 1089 Gather all column indices to all processors 1090 */ 1091 for (i = 0; i < size; i++) { 1092 recvcounts[i] = 0; 1093 for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) { recvcounts[i] += lens[j]; } 1094 } 1095 displs[0] = 0; 1096 for (i = 1; i < size; i++) { displs[i] = displs[i - 1] + recvcounts[i - 1]; } 1097 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A))); 1098 /*-------------------------------------------------------------------- 1099 Assemble the matrix into useable form (note numerical values not yet set) 1100 */ 1101 /* set the b->ilen (length of each row) values */ 1102 PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N)); 1103 /* set the b->i indices */ 1104 b->i[0] = 0; 1105 for (i = 1; i <= A->rmap->N; i++) { b->i[i] = b->i[i - 1] + lens[i - 1]; } 1106 PetscCall(PetscFree(lens)); 1107 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1108 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1109 1110 } else { 1111 B = **Bin; 1112 b = (Mat_SeqAIJ *)B->data; 1113 } 1114 1115 /*-------------------------------------------------------------------- 1116 Copy my part of matrix numerical values into the values location 1117 */ 1118 if (flag == MAT_GET_VALUES) { 1119 const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf; 1120 MatScalar *sendbuf, *recvbuf; 1121 1122 PetscCall(MatSeqAIJGetArrayRead(a->A, &ada)); 1123 PetscCall(MatSeqAIJGetArrayRead(a->B, &bda)); 1124 sendcount = ad->nz + bd->nz; 1125 sendbuf = b->a + b->i[rstarts[rank]]; 1126 a_sendbuf = ada; 1127 b_sendbuf = bda; 1128 b_sendj = bd->j; 1129 n = A->rmap->rend - A->rmap->rstart; 1130 cnt = 0; 1131 for (i = 0; i < n; 
i++) { 1132 /* put in lower diagonal portion */ 1133 m = bd->i[i + 1] - bd->i[i]; 1134 while (m > 0) { 1135 /* is it above diagonal (in bd (compressed) numbering) */ 1136 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1137 sendbuf[cnt++] = *b_sendbuf++; 1138 m--; 1139 b_sendj++; 1140 } 1141 1142 /* put in diagonal portion */ 1143 for (j = ad->i[i]; j < ad->i[i + 1]; j++) { sendbuf[cnt++] = *a_sendbuf++; } 1144 1145 /* put in upper diagonal portion */ 1146 while (m-- > 0) { 1147 sendbuf[cnt++] = *b_sendbuf++; 1148 b_sendj++; 1149 } 1150 } 1151 PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt); 1152 1153 /* ----------------------------------------------------------------- 1154 Gather all numerical values to all processors 1155 */ 1156 if (!recvcounts) { PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); } 1157 for (i = 0; i < size; i++) { recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]]; } 1158 displs[0] = 0; 1159 for (i = 1; i < size; i++) { displs[i] = displs[i - 1] + recvcounts[i - 1]; } 1160 recvbuf = b->a; 1161 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A))); 1162 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada)); 1163 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda)); 1164 } /* endof (flag == MAT_GET_VALUES) */ 1165 PetscCall(PetscFree2(recvcounts, displs)); 1166 1167 PetscCall(MatPropagateSymmetryOptions(A, B)); 1168 PetscFunctionReturn(0); 1169 } 1170 1171 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats) { 1172 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 1173 Mat submat, A = c->A, B = c->B; 1174 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc; 1175 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, 
nzB; 1176 PetscInt cstart = C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray; 1177 const PetscInt *icol, *irow; 1178 PetscInt nrow, ncol, start; 1179 PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr; 1180 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc; 1181 PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr; 1182 PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz; 1183 PetscInt *lens, rmax, ncols, *cols, Crow; 1184 #if defined(PETSC_USE_CTABLE) 1185 PetscTable cmap, rmap; 1186 PetscInt *cmap_loc, *rmap_loc; 1187 #else 1188 PetscInt *cmap, *rmap; 1189 #endif 1190 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i; 1191 PetscInt *cworkB, lwrite, *subcols, *row2proc; 1192 PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL; 1193 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 1194 MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4; 1195 MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1196 MPI_Status *r_status3 = NULL, *r_status4, *s_status4; 1197 MPI_Comm comm; 1198 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1199 PetscMPIInt *onodes1, *olengths1, idex, end; 1200 Mat_SubSppt *smatis1; 1201 PetscBool isrowsorted, iscolsorted; 1202 1203 PetscFunctionBegin; 1204 PetscValidLogicalCollectiveInt(C, ismax, 2); 1205 PetscValidLogicalCollectiveEnum(C, scall, 5); 1206 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1207 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1208 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1209 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1210 size = c->size; 1211 rank = c->rank; 1212 1213 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1214 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1215 PetscCall(ISGetIndices(isrow[0], 
&irow)); 1216 PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1217 if (allcolumns) { 1218 icol = NULL; 1219 ncol = C->cmap->N; 1220 } else { 1221 PetscCall(ISGetIndices(iscol[0], &icol)); 1222 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1223 } 1224 1225 if (scall == MAT_INITIAL_MATRIX) { 1226 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1227 1228 /* Get some new tags to keep the communication clean */ 1229 tag1 = ((PetscObject)C)->tag; 1230 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1231 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1232 1233 /* evaluate communication - mesg to who, length of mesg, and buffer space 1234 required. Based on this, buffers are allocated, and data copied into them */ 1235 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1236 PetscCall(PetscMalloc1(nrow, &row2proc)); 1237 1238 /* w1[proc] = num of rows owned by proc -- to be requested */ 1239 proc = 0; 1240 nrqs = 0; /* num of outgoing messages */ 1241 for (j = 0; j < nrow; j++) { 1242 row = irow[j]; 1243 if (!isrowsorted) proc = 0; 1244 while (row >= C->rmap->range[proc + 1]) proc++; 1245 w1[proc]++; 1246 row2proc[j] = proc; /* map row index to proc */ 1247 1248 if (proc != rank && !w2[proc]) { 1249 w2[proc] = 1; 1250 nrqs++; 1251 } 1252 } 1253 w1[rank] = 0; /* rows owned by self will not be requested */ 1254 1255 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1256 for (proc = 0, j = 0; proc < size; proc++) { 1257 if (w1[proc]) { pa[j++] = proc; } 1258 } 1259 1260 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1261 msz = 0; /* total mesg length (for all procs) */ 1262 for (i = 0; i < nrqs; i++) { 1263 proc = pa[i]; 1264 w1[proc] += 3; 1265 msz += w1[proc]; 1266 } 1267 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1268 1269 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1270 /* if w2[proc]=1, a message of length 
w1[proc] will be sent to proc; */ 1271 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1272 1273 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1274 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1275 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1276 1277 /* Now post the Irecvs corresponding to these messages */ 1278 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1279 1280 PetscCall(PetscFree(onodes1)); 1281 PetscCall(PetscFree(olengths1)); 1282 1283 /* Allocate Memory for outgoing messages */ 1284 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1285 PetscCall(PetscArrayzero(sbuf1, size)); 1286 PetscCall(PetscArrayzero(ptr, size)); 1287 1288 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1289 iptr = tmp; 1290 for (i = 0; i < nrqs; i++) { 1291 proc = pa[i]; 1292 sbuf1[proc] = iptr; 1293 iptr += w1[proc]; 1294 } 1295 1296 /* Form the outgoing messages */ 1297 /* Initialize the header space */ 1298 for (i = 0; i < nrqs; i++) { 1299 proc = pa[i]; 1300 PetscCall(PetscArrayzero(sbuf1[proc], 3)); 1301 ptr[proc] = sbuf1[proc] + 3; 1302 } 1303 1304 /* Parse the isrow and copy data into outbuf */ 1305 PetscCall(PetscArrayzero(ctr, size)); 1306 for (j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1307 proc = row2proc[j]; 1308 if (proc != rank) { /* copy to the outgoing buf*/ 1309 *ptr[proc] = irow[j]; 1310 ctr[proc]++; 1311 ptr[proc]++; 1312 } 1313 } 1314 1315 /* Update the headers for the current IS */ 1316 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 1317 if ((ctr_j = ctr[j])) { 1318 sbuf1_j = sbuf1[j]; 1319 k = ++sbuf1_j[0]; 1320 sbuf1_j[2 * k] = ctr_j; 1321 sbuf1_j[2 * k - 1] = 0; 1322 } 1323 } 1324 1325 /* Now post the sends */ 1326 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1327 for (i = 0; i < nrqs; ++i) { 1328 proc = pa[i]; 1329 
PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i)); 1330 } 1331 1332 /* Post Receives to capture the buffer size */ 1333 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1334 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1335 1336 if (nrqs) rbuf2[0] = tmp + msz; 1337 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1338 1339 for (i = 0; i < nrqs; ++i) { 1340 proc = pa[i]; 1341 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i)); 1342 } 1343 1344 PetscCall(PetscFree2(w1, w2)); 1345 1346 /* Send to other procs the buf size they should allocate */ 1347 /* Receive messages*/ 1348 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1349 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1350 1351 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1352 for (i = 0; i < nrqr; ++i) { 1353 req_size[i] = 0; 1354 rbuf1_i = rbuf1[i]; 1355 start = 2 * rbuf1_i[0] + 1; 1356 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1357 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1358 sbuf2_i = sbuf2[i]; 1359 for (j = start; j < end; j++) { 1360 k = rbuf1_i[j] - rstart; 1361 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1362 sbuf2_i[j] = ncols; 1363 req_size[i] += ncols; 1364 } 1365 req_source1[i] = r_status1[i].MPI_SOURCE; 1366 1367 /* form the header */ 1368 sbuf2_i[0] = req_size[i]; 1369 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1370 1371 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1372 } 1373 1374 PetscCall(PetscFree(r_status1)); 1375 PetscCall(PetscFree(r_waits1)); 1376 1377 /* rbuf2 is received, Post recv column indices a->j */ 1378 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1379 1380 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1381 for (i = 0; i < nrqs; ++i) { 1382 
PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 1383 req_source2[i] = r_status2[i].MPI_SOURCE; 1384 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1385 } 1386 1387 /* Wait on sends1 and sends2 */ 1388 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1389 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1390 PetscCall(PetscFree(s_waits1)); 1391 PetscCall(PetscFree(s_status1)); 1392 1393 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1394 PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2)); 1395 1396 /* Now allocate sending buffers for a->j, and send them off */ 1397 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 1398 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1399 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 1400 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1401 1402 for (i = 0; i < nrqr; i++) { /* for each requested message */ 1403 rbuf1_i = rbuf1[i]; 1404 sbuf_aj_i = sbuf_aj[i]; 1405 ct1 = 2 * rbuf1_i[0] + 1; 1406 ct2 = 0; 1407 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1408 1409 kmax = rbuf1[i][2]; 1410 for (k = 0; k < kmax; k++, ct1++) { /* for each row */ 1411 row = rbuf1_i[ct1] - rstart; 1412 nzA = ai[row + 1] - ai[row]; 1413 nzB = bi[row + 1] - bi[row]; 1414 ncols = nzA + nzB; 1415 cworkA = aj + ai[row]; 1416 cworkB = bj + bi[row]; 1417 1418 /* load the column indices for this row into cols*/ 1419 cols = sbuf_aj_i + ct2; 1420 1421 lwrite = 0; 1422 for (l = 0; l < nzB; l++) { 1423 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1424 } 1425 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1426 for (l = 0; l < nzB; l++) { 1427 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1428 } 1429 1430 ct2 += ncols; 1431 } 1432 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1433 } 1434 1435 /* create column map (cmap): 
global col of C -> local col of submat */ 1436 #if defined(PETSC_USE_CTABLE) 1437 if (!allcolumns) { 1438 PetscCall(PetscTableCreate(ncol, C->cmap->N, &cmap)); 1439 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1440 for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1441 if (icol[j] >= cstart && icol[j] < cend) { 1442 cmap_loc[icol[j] - cstart] = j + 1; 1443 } else { /* use PetscTable for non-local col indices */ 1444 PetscCall(PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES)); 1445 } 1446 } 1447 } else { 1448 cmap = NULL; 1449 cmap_loc = NULL; 1450 } 1451 PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1452 #else 1453 if (!allcolumns) { 1454 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1455 for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1456 } else { 1457 cmap = NULL; 1458 } 1459 #endif 1460 1461 /* Create lens for MatSeqAIJSetPreallocation() */ 1462 PetscCall(PetscCalloc1(nrow, &lens)); 1463 1464 /* Compute lens from local part of C */ 1465 for (j = 0; j < nrow; j++) { 1466 row = irow[j]; 1467 proc = row2proc[j]; 1468 if (proc == rank) { 1469 /* diagonal part A = c->A */ 1470 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1471 cols = aj + ai[row - rstart]; 1472 if (!allcolumns) { 1473 for (k = 0; k < ncols; k++) { 1474 #if defined(PETSC_USE_CTABLE) 1475 tcol = cmap_loc[cols[k]]; 1476 #else 1477 tcol = cmap[cols[k] + cstart]; 1478 #endif 1479 if (tcol) lens[j]++; 1480 } 1481 } else { /* allcolumns */ 1482 lens[j] = ncols; 1483 } 1484 1485 /* off-diagonal part B = c->B */ 1486 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1487 cols = bj + bi[row - rstart]; 1488 if (!allcolumns) { 1489 for (k = 0; k < ncols; k++) { 1490 #if defined(PETSC_USE_CTABLE) 1491 PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol)); 1492 #else 1493 tcol = cmap[bmap[cols[k]]]; 1494 #endif 1495 if (tcol) lens[j]++; 1496 } 1497 } else { /* allcolumns */ 1498 lens[j] += ncols; 1499 } 1500 } 1501 } 1502 1503 /* Create row map (rmap): global row of C 
-> local row of submat */ 1504 #if defined(PETSC_USE_CTABLE) 1505 PetscCall(PetscTableCreate(nrow, C->rmap->N, &rmap)); 1506 for (j = 0; j < nrow; j++) { 1507 row = irow[j]; 1508 proc = row2proc[j]; 1509 if (proc == rank) { /* a local row */ 1510 rmap_loc[row - rstart] = j; 1511 } else { 1512 PetscCall(PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES)); 1513 } 1514 } 1515 #else 1516 PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1517 for (j = 0; j < nrow; j++) { rmap[irow[j]] = j; } 1518 #endif 1519 1520 /* Update lens from offproc data */ 1521 /* recv a->j is done */ 1522 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1523 for (i = 0; i < nrqs; i++) { 1524 proc = pa[i]; 1525 sbuf1_i = sbuf1[proc]; 1526 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1527 ct1 = 2 + 1; 1528 ct2 = 0; 1529 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1530 rbuf3_i = rbuf3[i]; /* received C->j */ 1531 1532 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1533 max1 = sbuf1_i[2]; 1534 for (k = 0; k < max1; k++, ct1++) { 1535 #if defined(PETSC_USE_CTABLE) 1536 PetscCall(PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row)); 1537 row--; 1538 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1539 #else 1540 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1541 #endif 1542 /* Now, store row index of submat in sbuf1_i[ct1] */ 1543 sbuf1_i[ct1] = row; 1544 1545 nnz = rbuf2_i[ct1]; 1546 if (!allcolumns) { 1547 for (l = 0; l < nnz; l++, ct2++) { 1548 #if defined(PETSC_USE_CTABLE) 1549 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1550 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1551 } else { 1552 PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol)); 1553 } 1554 #else 1555 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1556 #endif 1557 if (tcol) lens[row]++; 1558 } 1559 } else { /* allcolumns */ 1560 lens[row] += nnz; 1561 } 1562 } 1563 } 1564 
PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3)); 1565 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1566 1567 /* Create the submatrices */ 1568 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1569 PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1570 1571 PetscCall(ISGetBlockSize(isrow[0], &i)); 1572 PetscCall(ISGetBlockSize(iscol[0], &j)); 1573 PetscCall(MatSetBlockSizes(submat, i, j)); 1574 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1575 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1576 1577 /* create struct Mat_SubSppt and attached it to submat */ 1578 PetscCall(PetscNew(&smatis1)); 1579 subc = (Mat_SeqAIJ *)submat->data; 1580 subc->submatis1 = smatis1; 1581 1582 smatis1->id = 0; 1583 smatis1->nrqs = nrqs; 1584 smatis1->nrqr = nrqr; 1585 smatis1->rbuf1 = rbuf1; 1586 smatis1->rbuf2 = rbuf2; 1587 smatis1->rbuf3 = rbuf3; 1588 smatis1->sbuf2 = sbuf2; 1589 smatis1->req_source2 = req_source2; 1590 1591 smatis1->sbuf1 = sbuf1; 1592 smatis1->ptr = ptr; 1593 smatis1->tmp = tmp; 1594 smatis1->ctr = ctr; 1595 1596 smatis1->pa = pa; 1597 smatis1->req_size = req_size; 1598 smatis1->req_source1 = req_source1; 1599 1600 smatis1->allcolumns = allcolumns; 1601 smatis1->singleis = PETSC_TRUE; 1602 smatis1->row2proc = row2proc; 1603 smatis1->rmap = rmap; 1604 smatis1->cmap = cmap; 1605 #if defined(PETSC_USE_CTABLE) 1606 smatis1->rmap_loc = rmap_loc; 1607 smatis1->cmap_loc = cmap_loc; 1608 #endif 1609 1610 smatis1->destroy = submat->ops->destroy; 1611 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1612 submat->factortype = C->factortype; 1613 1614 /* compute rmax */ 1615 rmax = 0; 1616 for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1617 1618 } else { /* scall == MAT_REUSE_MATRIX */ 1619 submat = submats[0]; 1620 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1621 1622 subc = (Mat_SeqAIJ *)submat->data; 1623 rmax = subc->rmax; 1624 smatis1 = subc->submatis1; 1625 nrqs = smatis1->nrqs; 1626 nrqr = smatis1->nrqr; 1627 rbuf1 = smatis1->rbuf1; 1628 rbuf2 = smatis1->rbuf2; 1629 rbuf3 = smatis1->rbuf3; 1630 req_source2 = smatis1->req_source2; 1631 1632 sbuf1 = smatis1->sbuf1; 1633 sbuf2 = smatis1->sbuf2; 1634 ptr = smatis1->ptr; 1635 tmp = smatis1->tmp; 1636 ctr = smatis1->ctr; 1637 1638 pa = smatis1->pa; 1639 req_size = smatis1->req_size; 1640 req_source1 = smatis1->req_source1; 1641 1642 allcolumns = smatis1->allcolumns; 1643 row2proc = smatis1->row2proc; 1644 rmap = smatis1->rmap; 1645 cmap = smatis1->cmap; 1646 #if defined(PETSC_USE_CTABLE) 1647 rmap_loc = smatis1->rmap_loc; 1648 cmap_loc = smatis1->cmap_loc; 1649 #endif 1650 } 1651 1652 /* Post recv matrix values */ 1653 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1654 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1655 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1656 for (i = 0; i < nrqs; ++i) { 1657 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1658 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1659 } 1660 1661 /* Allocate sending buffers for a->a, and send them off */ 1662 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1663 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1664 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 1665 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1666 1667 for (i = 0; i < nrqr; i++) { 1668 rbuf1_i = rbuf1[i]; 1669 sbuf_aa_i = sbuf_aa[i]; 1670 ct1 = 2 * rbuf1_i[0] + 1; 1671 ct2 = 0; 1672 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1673 1674 kmax = rbuf1_i[2]; 1675 for (k = 0; k < kmax; k++, ct1++) { 1676 row = rbuf1_i[ct1] - rstart; 1677 nzA = ai[row + 1] - ai[row]; 1678 nzB = bi[row + 1] - bi[row]; 1679 ncols = 
nzA + nzB; 1680 cworkB = bj + bi[row]; 1681 vworkA = a_a + ai[row]; 1682 vworkB = b_a + bi[row]; 1683 1684 /* load the column values for this row into vals*/ 1685 vals = sbuf_aa_i + ct2; 1686 1687 lwrite = 0; 1688 for (l = 0; l < nzB; l++) { 1689 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1690 } 1691 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1692 for (l = 0; l < nzB; l++) { 1693 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1694 } 1695 1696 ct2 += ncols; 1697 } 1698 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1699 } 1700 1701 /* Assemble submat */ 1702 /* First assemble the local rows */ 1703 for (j = 0; j < nrow; j++) { 1704 row = irow[j]; 1705 proc = row2proc[j]; 1706 if (proc == rank) { 1707 Crow = row - rstart; /* local row index of C */ 1708 #if defined(PETSC_USE_CTABLE) 1709 row = rmap_loc[Crow]; /* row index of submat */ 1710 #else 1711 row = rmap[row]; 1712 #endif 1713 1714 if (allcolumns) { 1715 /* diagonal part A = c->A */ 1716 ncols = ai[Crow + 1] - ai[Crow]; 1717 cols = aj + ai[Crow]; 1718 vals = a_a + ai[Crow]; 1719 i = 0; 1720 for (k = 0; k < ncols; k++) { 1721 subcols[i] = cols[k] + cstart; 1722 subvals[i++] = vals[k]; 1723 } 1724 1725 /* off-diagonal part B = c->B */ 1726 ncols = bi[Crow + 1] - bi[Crow]; 1727 cols = bj + bi[Crow]; 1728 vals = b_a + bi[Crow]; 1729 for (k = 0; k < ncols; k++) { 1730 subcols[i] = bmap[cols[k]]; 1731 subvals[i++] = vals[k]; 1732 } 1733 1734 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1735 1736 } else { /* !allcolumns */ 1737 #if defined(PETSC_USE_CTABLE) 1738 /* diagonal part A = c->A */ 1739 ncols = ai[Crow + 1] - ai[Crow]; 1740 cols = aj + ai[Crow]; 1741 vals = a_a + ai[Crow]; 1742 i = 0; 1743 for (k = 0; k < ncols; k++) { 1744 tcol = cmap_loc[cols[k]]; 1745 if (tcol) { 1746 subcols[i] = --tcol; 1747 subvals[i++] = vals[k]; 1748 } 1749 } 1750 1751 /* off-diagonal part B = c->B 
*/ 1752 ncols = bi[Crow + 1] - bi[Crow]; 1753 cols = bj + bi[Crow]; 1754 vals = b_a + bi[Crow]; 1755 for (k = 0; k < ncols; k++) { 1756 PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol)); 1757 if (tcol) { 1758 subcols[i] = --tcol; 1759 subvals[i++] = vals[k]; 1760 } 1761 } 1762 #else 1763 /* diagonal part A = c->A */ 1764 ncols = ai[Crow + 1] - ai[Crow]; 1765 cols = aj + ai[Crow]; 1766 vals = a_a + ai[Crow]; 1767 i = 0; 1768 for (k = 0; k < ncols; k++) { 1769 tcol = cmap[cols[k] + cstart]; 1770 if (tcol) { 1771 subcols[i] = --tcol; 1772 subvals[i++] = vals[k]; 1773 } 1774 } 1775 1776 /* off-diagonal part B = c->B */ 1777 ncols = bi[Crow + 1] - bi[Crow]; 1778 cols = bj + bi[Crow]; 1779 vals = b_a + bi[Crow]; 1780 for (k = 0; k < ncols; k++) { 1781 tcol = cmap[bmap[cols[k]]]; 1782 if (tcol) { 1783 subcols[i] = --tcol; 1784 subvals[i++] = vals[k]; 1785 } 1786 } 1787 #endif 1788 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1789 } 1790 } 1791 } 1792 1793 /* Now assemble the off-proc rows */ 1794 for (i = 0; i < nrqs; i++) { /* for each requested message */ 1795 /* recv values from other processes */ 1796 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1797 proc = pa[idex]; 1798 sbuf1_i = sbuf1[proc]; 1799 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1800 ct1 = 2 + 1; 1801 ct2 = 0; /* count of received C->j */ 1802 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1803 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1804 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1805 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1806 1807 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1808 max1 = sbuf1_i[2]; /* num of rows */ 1809 for (k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1810 row = sbuf1_i[ct1]; 
/* row index of submat */ 1811 if (!allcolumns) { 1812 idex = 0; 1813 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1814 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1815 for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1816 #if defined(PETSC_USE_CTABLE) 1817 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1818 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1819 } else { 1820 PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol)); 1821 } 1822 #else 1823 tcol = cmap[rbuf3_i[ct2]]; 1824 #endif 1825 if (tcol) { 1826 subcols[idex] = --tcol; /* may not be sorted */ 1827 subvals[idex++] = rbuf4_i[ct2]; 1828 1829 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1830 For reuse, we replace received C->j with index that should be inserted to submat */ 1831 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1832 } 1833 } 1834 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1835 } else { /* scall == MAT_REUSE_MATRIX */ 1836 submat = submats[0]; 1837 subc = (Mat_SeqAIJ *)submat->data; 1838 1839 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1840 for (l = 0; l < nnz; l++) { 1841 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1842 subvals[idex++] = rbuf4_i[ct2]; 1843 } 1844 1845 bj = subc->j + subc->i[row]; /* sorted column indices */ 1846 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1847 } 1848 } else { /* allcolumns */ 1849 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1850 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES)); 1851 ct2 += nnz; 1852 } 1853 } 1854 } 1855 1856 /* sending a->a are done */ 1857 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1858 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1859 1860 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1861 
PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
  submats[0] = submat;

  /* Restore the indices */
  PetscCall(ISRestoreIndices(isrow[0], &irow));
  if (!allcolumns) { PetscCall(ISRestoreIndices(iscol[0], &icol)); }

  /* Destroy allocated memory */
  for (i = 0; i < nrqs; ++i) { PetscCall(PetscFree(rbuf4[i])); }
  PetscCall(PetscFree3(rbuf4, subcols, subvals));
  if (sbuf_aa) {
    PetscCall(PetscFree(sbuf_aa[0]));
    PetscCall(PetscFree(sbuf_aa));
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscFree(lens));
    if (sbuf_aj) {
      PetscCall(PetscFree(sbuf_aj[0]));
      PetscCall(PetscFree(sbuf_aj));
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS - extract a (sequential) submatrix when each
  process contributes exactly one index set.

  Input Parameters:
+ C      - the parallel MPIAIJ matrix
. ismax  - number of index sets (the single-IS fast path; forwarded to the _Local routine)
. isrow  - row index sets, one per requested submatrix
. iscol  - column index sets
- scall  - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

  Output Parameter:
. submat - array of extracted sequential submatrices

  Notes:
  On MAT_INITIAL_MATRIX, two zeroed slots are allocated in *submat (the _Local routine
  fills submat[0]; the second slot is left zeroed by the PetscCalloc1).
  If iscol[0] is the identity over all global columns, the "allcolumns" fast path is
  taken and column mapping is skipped in the _Local routine.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) {
  PetscInt  ncol;
  PetscBool colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(2, submat)); }

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol[0], &colflag));
  PetscCall(ISGetLocalSize(iscol[0], &ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ - extract ismax sequential submatrices of the parallel
  matrix C, one per (isrow[i], iscol[i]) pair.

  Input Parameters:
+ C      - the parallel MPIAIJ matrix
. ismax  - number of submatrices requested on this process
. isrow  - row index sets
. iscol  - column index sets
- scall  - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

  Output Parameter:
. submat - array holding the ismax submatrices (plus trailing dummy matrices used to
           keep every rank in step with the global stage count)

  Notes:
  The extraction is performed in nstages passes of at most nmax submatrices each, so
  that the per-stage work arrays (sized by the global column count) stay bounded.
  Two fast paths bypass the staged algorithm: the single-IS path flagged by
  C->submat_singleis, and the whole-matrix path (every rank requests the identity on
  both rows and columns).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) {
  PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax caps how many submatrices one stage may extract; it budgets roughly
     20M PetscInts of per-stage workspace against the global column count */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single reduction for both decisions: MPI_MAX over -wantallmatrix yields 0
       (i.e. FALSE) as soon as any rank does NOT want the whole matrix (an AND),
       while MPI_MAX over nstages gives the largest local stage count so every
       rank executes the same number of (possibly empty) stages. */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscCalloc1(ismax + nstages, submat)); }

  for (i = 0, pos = 0; i < nstages; i++) {
    /* max_no = number of submatrices extracted in this stage (0 for a stage in
       which this rank only participates collectively) */
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) {
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc;
  /* per-IS row/column index arrays and sizes */
  const PetscInt **icol, **irow;
  PetscInt        *nrow, *ncol, start;
  PetscMPIInt      rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr;
  /* sbuf*/rbuf* are the per-process send/receive message buffers */
  PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1;
  PetscInt   nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol;
  PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2;
  PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i;
#else
  PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt        ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i;
  MPI_Request    *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
  MPI_Request    *r_waits4, *s_waits3, *s_waits4;
  MPI_Comm        comm;
  PetscScalar   **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i;
  PetscMPIInt    *onodes1, *olengths1, end;
  PetscInt      **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE;
  PetscInt       *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;

  PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns));
  PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt
***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2037 2038 for (i = 0; i < ismax; i++) { 2039 PetscCall(ISSorted(iscol[i], &issorted[i])); 2040 if (!issorted[i]) iscsorted = issorted[i]; 2041 2042 PetscCall(ISSorted(isrow[i], &issorted[i])); 2043 2044 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2045 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2046 2047 /* Check for special case: allcolumn */ 2048 PetscCall(ISIdentity(iscol[i], &colflag)); 2049 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2050 if (colflag && ncol[i] == C->cmap->N) { 2051 allcolumns[i] = PETSC_TRUE; 2052 icol[i] = NULL; 2053 } else { 2054 allcolumns[i] = PETSC_FALSE; 2055 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2056 } 2057 } 2058 2059 if (scall == MAT_REUSE_MATRIX) { 2060 /* Assumes new rows are same length as the old rows */ 2061 for (i = 0; i < ismax; i++) { 2062 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2063 subc = (Mat_SeqAIJ *)submats[i]->data; 2064 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 2065 2066 /* Initial matrix as if empty */ 2067 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2068 2069 smat_i = subc->submatis1; 2070 2071 nrqs = smat_i->nrqs; 2072 nrqr = smat_i->nrqr; 2073 rbuf1 = smat_i->rbuf1; 2074 rbuf2 = smat_i->rbuf2; 2075 rbuf3 = smat_i->rbuf3; 2076 req_source2 = smat_i->req_source2; 2077 2078 sbuf1 = smat_i->sbuf1; 2079 sbuf2 = smat_i->sbuf2; 2080 ptr = smat_i->ptr; 2081 tmp = smat_i->tmp; 2082 ctr = smat_i->ctr; 2083 2084 pa = smat_i->pa; 2085 req_size = smat_i->req_size; 2086 req_source1 = smat_i->req_source1; 2087 2088 allcolumns[i] = smat_i->allcolumns; 2089 row2proc[i] = smat_i->row2proc; 2090 rmap[i] = smat_i->rmap; 2091 cmap[i] = smat_i->cmap; 2092 } 2093 2094 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2095 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2096 smat_i = (Mat_SubSppt *)submats[0]->data; 2097 2098 nrqs = smat_i->nrqs; 2099 nrqr = smat_i->nrqr; 2100 rbuf1 = smat_i->rbuf1; 2101 rbuf2 = smat_i->rbuf2; 2102 rbuf3 = smat_i->rbuf3; 2103 req_source2 = smat_i->req_source2; 2104 2105 sbuf1 = smat_i->sbuf1; 2106 sbuf2 = smat_i->sbuf2; 2107 ptr = smat_i->ptr; 2108 tmp = smat_i->tmp; 2109 ctr = smat_i->ctr; 2110 2111 pa = smat_i->pa; 2112 req_size = smat_i->req_size; 2113 req_source1 = smat_i->req_source1; 2114 2115 allcolumns[0] = PETSC_FALSE; 2116 } 2117 } else { /* scall == MAT_INITIAL_MATRIX */ 2118 /* Get some new tags to keep the communication clean */ 2119 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2120 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2121 2122 /* evaluate communication - mesg to who, length of mesg, and buffer space 2123 required. 
Based on this, buffers are allocated, and data copied into them*/ 2124 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2125 2126 for (i = 0; i < ismax; i++) { 2127 jmax = nrow[i]; 2128 irow_i = irow[i]; 2129 2130 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2131 row2proc[i] = row2proc_i; 2132 2133 if (issorted[i]) proc = 0; 2134 for (j = 0; j < jmax; j++) { 2135 if (!issorted[i]) proc = 0; 2136 row = irow_i[j]; 2137 while (row >= C->rmap->range[proc + 1]) proc++; 2138 w4[proc]++; 2139 row2proc_i[j] = proc; /* map row index to proc */ 2140 } 2141 for (j = 0; j < size; j++) { 2142 if (w4[j]) { 2143 w1[j] += w4[j]; 2144 w3[j]++; 2145 w4[j] = 0; 2146 } 2147 } 2148 } 2149 2150 nrqs = 0; /* no of outgoing messages */ 2151 msz = 0; /* total mesg length (for all procs) */ 2152 w1[rank] = 0; /* no mesg sent to self */ 2153 w3[rank] = 0; 2154 for (i = 0; i < size; i++) { 2155 if (w1[i]) { 2156 w2[i] = 1; 2157 nrqs++; 2158 } /* there exists a message to proc i */ 2159 } 2160 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2161 for (i = 0, j = 0; i < size; i++) { 2162 if (w1[i]) { 2163 pa[j] = i; 2164 j++; 2165 } 2166 } 2167 2168 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2169 for (i = 0; i < nrqs; i++) { 2170 j = pa[i]; 2171 w1[j] += w2[j] + 2 * w3[j]; 2172 msz += w1[j]; 2173 } 2174 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2175 2176 /* Determine the number of messages to expect, their lengths, from from-ids */ 2177 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 2178 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2179 2180 /* Now post the Irecvs corresponding to these messages */ 2181 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2182 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2183 2184 /* Allocate Memory 
for outgoing messages */ 2185 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2186 PetscCall(PetscArrayzero(sbuf1, size)); 2187 PetscCall(PetscArrayzero(ptr, size)); 2188 2189 { 2190 PetscInt *iptr = tmp; 2191 k = 0; 2192 for (i = 0; i < nrqs; i++) { 2193 j = pa[i]; 2194 iptr += k; 2195 sbuf1[j] = iptr; 2196 k = w1[j]; 2197 } 2198 } 2199 2200 /* Form the outgoing messages. Initialize the header space */ 2201 for (i = 0; i < nrqs; i++) { 2202 j = pa[i]; 2203 sbuf1[j][0] = 0; 2204 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2205 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2206 } 2207 2208 /* Parse the isrow and copy data into outbuf */ 2209 for (i = 0; i < ismax; i++) { 2210 row2proc_i = row2proc[i]; 2211 PetscCall(PetscArrayzero(ctr, size)); 2212 irow_i = irow[i]; 2213 jmax = nrow[i]; 2214 for (j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2215 proc = row2proc_i[j]; 2216 if (proc != rank) { /* copy to the outgoing buf*/ 2217 ctr[proc]++; 2218 *ptr[proc] = irow_i[j]; 2219 ptr[proc]++; 2220 } 2221 } 2222 /* Update the headers for the current IS */ 2223 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 2224 if ((ctr_j = ctr[j])) { 2225 sbuf1_j = sbuf1[j]; 2226 k = ++sbuf1_j[0]; 2227 sbuf1_j[2 * k] = ctr_j; 2228 sbuf1_j[2 * k - 1] = i; 2229 } 2230 } 2231 } 2232 2233 /* Now post the sends */ 2234 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2235 for (i = 0; i < nrqs; ++i) { 2236 j = pa[i]; 2237 PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2238 } 2239 2240 /* Post Receives to capture the buffer size */ 2241 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2242 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2243 if (nrqs) rbuf2[0] = tmp + msz; 2244 for (i = 1; i < nrqs; ++i) { rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; } 2245 for (i = 0; i < nrqs; ++i) { 2246 j = pa[i]; 2247 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2248 } 2249 2250 
/* Send to other procs the buf size they should allocate */ 2251 /* Receive messages*/ 2252 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2253 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2254 { 2255 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2256 PetscInt *sbuf2_i; 2257 2258 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2259 for (i = 0; i < nrqr; ++i) { 2260 req_size[i] = 0; 2261 rbuf1_i = rbuf1[i]; 2262 start = 2 * rbuf1_i[0] + 1; 2263 end = olengths1[i]; 2264 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2265 sbuf2_i = sbuf2[i]; 2266 for (j = start; j < end; j++) { 2267 id = rbuf1_i[j] - rstart; 2268 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2269 sbuf2_i[j] = ncols; 2270 req_size[i] += ncols; 2271 } 2272 req_source1[i] = onodes1[i]; 2273 /* form the header */ 2274 sbuf2_i[0] = req_size[i]; 2275 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2276 2277 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2278 } 2279 } 2280 2281 PetscCall(PetscFree(onodes1)); 2282 PetscCall(PetscFree(olengths1)); 2283 PetscCall(PetscFree(r_waits1)); 2284 PetscCall(PetscFree4(w1, w2, w3, w4)); 2285 2286 /* Receive messages*/ 2287 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2288 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2289 for (i = 0; i < nrqs; ++i) { 2290 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2291 req_source2[i] = pa[i]; 2292 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2293 } 2294 PetscCall(PetscFree(r_waits2)); 2295 2296 /* Wait on sends1 and sends2 */ 2297 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2298 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2299 PetscCall(PetscFree(s_waits1)); 2300 PetscCall(PetscFree(s_waits2)); 2301 2302 /* Now allocate sending buffers for a->j, and send them off */ 2303 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 
2304 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2305 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 2306 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2307 2308 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2309 { 2310 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2311 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2312 PetscInt cend = C->cmap->rend; 2313 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2314 2315 for (i = 0; i < nrqr; i++) { 2316 rbuf1_i = rbuf1[i]; 2317 sbuf_aj_i = sbuf_aj[i]; 2318 ct1 = 2 * rbuf1_i[0] + 1; 2319 ct2 = 0; 2320 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2321 kmax = rbuf1[i][2 * j]; 2322 for (k = 0; k < kmax; k++, ct1++) { 2323 row = rbuf1_i[ct1] - rstart; 2324 nzA = a_i[row + 1] - a_i[row]; 2325 nzB = b_i[row + 1] - b_i[row]; 2326 ncols = nzA + nzB; 2327 cworkA = a_j + a_i[row]; 2328 cworkB = b_j + b_i[row]; 2329 2330 /* load the column indices for this row into cols */ 2331 cols = sbuf_aj_i + ct2; 2332 2333 lwrite = 0; 2334 for (l = 0; l < nzB; l++) { 2335 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2336 } 2337 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2338 for (l = 0; l < nzB; l++) { 2339 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2340 } 2341 2342 ct2 += ncols; 2343 } 2344 } 2345 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2346 } 2347 } 2348 2349 /* create col map: global col of C -> local col of submatrices */ 2350 { 2351 const PetscInt *icol_i; 2352 #if defined(PETSC_USE_CTABLE) 2353 for (i = 0; i < ismax; i++) { 2354 if (!allcolumns[i]) { 2355 PetscCall(PetscTableCreate(ncol[i], C->cmap->N, &cmap[i])); 2356 2357 jmax = ncol[i]; 2358 icol_i = icol[i]; 2359 cmap_i = cmap[i]; 2360 for (j = 0; j < jmax; j++) { PetscCall(PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES)); } 2361 } else cmap[i] = NULL; 2362 } 2363 #else 
2364 for (i = 0; i < ismax; i++) { 2365 if (!allcolumns[i]) { 2366 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2367 jmax = ncol[i]; 2368 icol_i = icol[i]; 2369 cmap_i = cmap[i]; 2370 for (j = 0; j < jmax; j++) { cmap_i[icol_i[j]] = j + 1; } 2371 } else cmap[i] = NULL; 2372 } 2373 #endif 2374 } 2375 2376 /* Create lens which is required for MatCreate... */ 2377 for (i = 0, j = 0; i < ismax; i++) j += nrow[i]; 2378 PetscCall(PetscMalloc1(ismax, &lens)); 2379 2380 if (ismax) { PetscCall(PetscCalloc1(j, &lens[0])); } 2381 for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1]; 2382 2383 /* Update lens from local data */ 2384 for (i = 0; i < ismax; i++) { 2385 row2proc_i = row2proc[i]; 2386 jmax = nrow[i]; 2387 if (!allcolumns[i]) cmap_i = cmap[i]; 2388 irow_i = irow[i]; 2389 lens_i = lens[i]; 2390 for (j = 0; j < jmax; j++) { 2391 row = irow_i[j]; 2392 proc = row2proc_i[j]; 2393 if (proc == rank) { 2394 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2395 if (!allcolumns[i]) { 2396 for (k = 0; k < ncols; k++) { 2397 #if defined(PETSC_USE_CTABLE) 2398 PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol)); 2399 #else 2400 tcol = cmap_i[cols[k]]; 2401 #endif 2402 if (tcol) lens_i[j]++; 2403 } 2404 } else { /* allcolumns */ 2405 lens_i[j] = ncols; 2406 } 2407 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2408 } 2409 } 2410 } 2411 2412 /* Create row map: global row of C -> local row of submatrices */ 2413 #if defined(PETSC_USE_CTABLE) 2414 for (i = 0; i < ismax; i++) { 2415 PetscCall(PetscTableCreate(nrow[i], C->rmap->N, &rmap[i])); 2416 irow_i = irow[i]; 2417 jmax = nrow[i]; 2418 for (j = 0; j < jmax; j++) { PetscCall(PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES)); } 2419 } 2420 #else 2421 for (i = 0; i < ismax; i++) { 2422 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2423 rmap_i = rmap[i]; 2424 irow_i = irow[i]; 2425 jmax = nrow[i]; 2426 for (j = 0; j < jmax; j++) { rmap_i[irow_i[j]] = j; } 2427 } 2428 #endif 2429 2430 
/* Update lens from offproc data */ 2431 { 2432 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2433 2434 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2435 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2436 sbuf1_i = sbuf1[pa[tmp2]]; 2437 jmax = sbuf1_i[0]; 2438 ct1 = 2 * jmax + 1; 2439 ct2 = 0; 2440 rbuf2_i = rbuf2[tmp2]; 2441 rbuf3_i = rbuf3[tmp2]; 2442 for (j = 1; j <= jmax; j++) { 2443 is_no = sbuf1_i[2 * j - 1]; 2444 max1 = sbuf1_i[2 * j]; 2445 lens_i = lens[is_no]; 2446 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2447 rmap_i = rmap[is_no]; 2448 for (k = 0; k < max1; k++, ct1++) { 2449 #if defined(PETSC_USE_CTABLE) 2450 PetscCall(PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row)); 2451 row--; 2452 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2453 #else 2454 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2455 #endif 2456 max2 = rbuf2_i[ct1]; 2457 for (l = 0; l < max2; l++, ct2++) { 2458 if (!allcolumns[is_no]) { 2459 #if defined(PETSC_USE_CTABLE) 2460 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol)); 2461 #else 2462 tcol = cmap_i[rbuf3_i[ct2]]; 2463 #endif 2464 if (tcol) lens_i[row]++; 2465 } else { /* allcolumns */ 2466 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2467 } 2468 } 2469 } 2470 } 2471 } 2472 } 2473 PetscCall(PetscFree(r_waits3)); 2474 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2475 PetscCall(PetscFree(s_waits3)); 2476 2477 /* Create the submatrices */ 2478 for (i = 0; i < ismax; i++) { 2479 PetscInt rbs, cbs; 2480 2481 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2482 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2483 2484 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2485 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2486 2487 PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2488 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2489 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2490 2491 /* create struct Mat_SubSppt and attached it to submat */ 2492 PetscCall(PetscNew(&smat_i)); 2493 subc = (Mat_SeqAIJ *)submats[i]->data; 2494 subc->submatis1 = smat_i; 2495 2496 smat_i->destroy = submats[i]->ops->destroy; 2497 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2498 submats[i]->factortype = C->factortype; 2499 2500 smat_i->id = i; 2501 smat_i->nrqs = nrqs; 2502 smat_i->nrqr = nrqr; 2503 smat_i->rbuf1 = rbuf1; 2504 smat_i->rbuf2 = rbuf2; 2505 smat_i->rbuf3 = rbuf3; 2506 smat_i->sbuf2 = sbuf2; 2507 smat_i->req_source2 = req_source2; 2508 2509 smat_i->sbuf1 = sbuf1; 2510 smat_i->ptr = ptr; 2511 smat_i->tmp = tmp; 2512 smat_i->ctr = ctr; 2513 2514 smat_i->pa = pa; 2515 smat_i->req_size = req_size; 2516 smat_i->req_source1 = req_source1; 2517 2518 smat_i->allcolumns = allcolumns[i]; 2519 smat_i->singleis = PETSC_FALSE; 2520 smat_i->row2proc = row2proc[i]; 2521 smat_i->rmap = rmap[i]; 2522 smat_i->cmap = cmap[i]; 2523 } 2524 2525 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2526 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2527 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2528 PetscCall(MatSetType(submats[0], MATDUMMY)); 2529 2530 /* create struct Mat_SubSppt 
and attached it to submat */ 2531 PetscCall(PetscNewLog(submats[0], &smat_i)); 2532 submats[0]->data = (void *)smat_i; 2533 2534 smat_i->destroy = submats[0]->ops->destroy; 2535 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2536 submats[0]->factortype = C->factortype; 2537 2538 smat_i->id = 0; 2539 smat_i->nrqs = nrqs; 2540 smat_i->nrqr = nrqr; 2541 smat_i->rbuf1 = rbuf1; 2542 smat_i->rbuf2 = rbuf2; 2543 smat_i->rbuf3 = rbuf3; 2544 smat_i->sbuf2 = sbuf2; 2545 smat_i->req_source2 = req_source2; 2546 2547 smat_i->sbuf1 = sbuf1; 2548 smat_i->ptr = ptr; 2549 smat_i->tmp = tmp; 2550 smat_i->ctr = ctr; 2551 2552 smat_i->pa = pa; 2553 smat_i->req_size = req_size; 2554 smat_i->req_source1 = req_source1; 2555 2556 smat_i->allcolumns = PETSC_FALSE; 2557 smat_i->singleis = PETSC_FALSE; 2558 smat_i->row2proc = NULL; 2559 smat_i->rmap = NULL; 2560 smat_i->cmap = NULL; 2561 } 2562 2563 if (ismax) PetscCall(PetscFree(lens[0])); 2564 PetscCall(PetscFree(lens)); 2565 if (sbuf_aj) { 2566 PetscCall(PetscFree(sbuf_aj[0])); 2567 PetscCall(PetscFree(sbuf_aj)); 2568 } 2569 2570 } /* endof scall == MAT_INITIAL_MATRIX */ 2571 2572 /* Post recv matrix values */ 2573 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2574 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2575 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2576 for (i = 0; i < nrqs; ++i) { 2577 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2578 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2579 } 2580 2581 /* Allocate sending buffers for a->a, and send them off */ 2582 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2583 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2584 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 2585 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2586 2587 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2588 { 2589 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite; 2590 PetscInt cstart = C->cmap->rstart, rstart = 
C->rmap->rstart, *bmap = c->garray; 2591 PetscInt cend = C->cmap->rend; 2592 PetscInt *b_j = b->j; 2593 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2594 2595 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2596 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2597 for (i = 0; i < nrqr; i++) { 2598 rbuf1_i = rbuf1[i]; 2599 sbuf_aa_i = sbuf_aa[i]; 2600 ct1 = 2 * rbuf1_i[0] + 1; 2601 ct2 = 0; 2602 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2603 kmax = rbuf1_i[2 * j]; 2604 for (k = 0; k < kmax; k++, ct1++) { 2605 row = rbuf1_i[ct1] - rstart; 2606 nzA = a_i[row + 1] - a_i[row]; 2607 nzB = b_i[row + 1] - b_i[row]; 2608 ncols = nzA + nzB; 2609 cworkB = b_j + b_i[row]; 2610 vworkA = a_a + a_i[row]; 2611 vworkB = b_a + b_i[row]; 2612 2613 /* load the column values for this row into vals*/ 2614 vals = sbuf_aa_i + ct2; 2615 2616 lwrite = 0; 2617 for (l = 0; l < nzB; l++) { 2618 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2619 } 2620 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2621 for (l = 0; l < nzB; l++) { 2622 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2623 } 2624 2625 ct2 += ncols; 2626 } 2627 } 2628 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2629 } 2630 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2631 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2632 } 2633 2634 /* Assemble the matrices */ 2635 /* First assemble the local rows */ 2636 for (i = 0; i < ismax; i++) { 2637 row2proc_i = row2proc[i]; 2638 subc = (Mat_SeqAIJ *)submats[i]->data; 2639 imat_ilen = subc->ilen; 2640 imat_j = subc->j; 2641 imat_i = subc->i; 2642 imat_a = subc->a; 2643 2644 if (!allcolumns[i]) cmap_i = cmap[i]; 2645 rmap_i = rmap[i]; 2646 irow_i = irow[i]; 2647 jmax = nrow[i]; 2648 for (j = 0; j < jmax; j++) { 2649 row = irow_i[j]; 2650 proc = row2proc_i[j]; 2651 if (proc == rank) { 2652 old_row = row; 2653 
#if defined(PETSC_USE_CTABLE) 2654 PetscCall(PetscTableFind(rmap_i, row + 1, &row)); 2655 row--; 2656 #else 2657 row = rmap_i[row]; 2658 #endif 2659 ilen_row = imat_ilen[row]; 2660 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2661 mat_i = imat_i[row]; 2662 mat_a = imat_a + mat_i; 2663 mat_j = imat_j + mat_i; 2664 if (!allcolumns[i]) { 2665 for (k = 0; k < ncols; k++) { 2666 #if defined(PETSC_USE_CTABLE) 2667 PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol)); 2668 #else 2669 tcol = cmap_i[cols[k]]; 2670 #endif 2671 if (tcol) { 2672 *mat_j++ = tcol - 1; 2673 *mat_a++ = vals[k]; 2674 ilen_row++; 2675 } 2676 } 2677 } else { /* allcolumns */ 2678 for (k = 0; k < ncols; k++) { 2679 *mat_j++ = cols[k]; /* global col index! */ 2680 *mat_a++ = vals[k]; 2681 ilen_row++; 2682 } 2683 } 2684 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2685 2686 imat_ilen[row] = ilen_row; 2687 } 2688 } 2689 } 2690 2691 /* Now assemble the off proc rows */ 2692 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2693 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2694 sbuf1_i = sbuf1[pa[tmp2]]; 2695 jmax = sbuf1_i[0]; 2696 ct1 = 2 * jmax + 1; 2697 ct2 = 0; 2698 rbuf2_i = rbuf2[tmp2]; 2699 rbuf3_i = rbuf3[tmp2]; 2700 rbuf4_i = rbuf4[tmp2]; 2701 for (j = 1; j <= jmax; j++) { 2702 is_no = sbuf1_i[2 * j - 1]; 2703 rmap_i = rmap[is_no]; 2704 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2705 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2706 imat_ilen = subc->ilen; 2707 imat_j = subc->j; 2708 imat_i = subc->i; 2709 imat_a = subc->a; 2710 max1 = sbuf1_i[2 * j]; 2711 for (k = 0; k < max1; k++, ct1++) { 2712 row = sbuf1_i[ct1]; 2713 #if defined(PETSC_USE_CTABLE) 2714 PetscCall(PetscTableFind(rmap_i, row + 1, &row)); 2715 row--; 2716 #else 2717 row = rmap_i[row]; 2718 #endif 2719 ilen = imat_ilen[row]; 2720 mat_i = imat_i[row]; 2721 mat_a = imat_a + mat_i; 2722 mat_j = imat_j + mat_i; 2723 max2 = rbuf2_i[ct1]; 2724 if (!allcolumns[is_no]) { 2725 for (l = 0; l < 
max2; l++, ct2++) { 2726 #if defined(PETSC_USE_CTABLE) 2727 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol)); 2728 #else 2729 tcol = cmap_i[rbuf3_i[ct2]]; 2730 #endif 2731 if (tcol) { 2732 *mat_j++ = tcol - 1; 2733 *mat_a++ = rbuf4_i[ct2]; 2734 ilen++; 2735 } 2736 } 2737 } else { /* allcolumns */ 2738 for (l = 0; l < max2; l++, ct2++) { 2739 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2740 *mat_a++ = rbuf4_i[ct2]; 2741 ilen++; 2742 } 2743 } 2744 imat_ilen[row] = ilen; 2745 } 2746 } 2747 } 2748 2749 if (!iscsorted) { /* sort column indices of the rows */ 2750 for (i = 0; i < ismax; i++) { 2751 subc = (Mat_SeqAIJ *)submats[i]->data; 2752 imat_j = subc->j; 2753 imat_i = subc->i; 2754 imat_a = subc->a; 2755 imat_ilen = subc->ilen; 2756 2757 if (allcolumns[i]) continue; 2758 jmax = nrow[i]; 2759 for (j = 0; j < jmax; j++) { 2760 mat_i = imat_i[j]; 2761 mat_a = imat_a + mat_i; 2762 mat_j = imat_j + mat_i; 2763 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2764 } 2765 } 2766 } 2767 2768 PetscCall(PetscFree(r_waits4)); 2769 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2770 PetscCall(PetscFree(s_waits4)); 2771 2772 /* Restore the indices */ 2773 for (i = 0; i < ismax; i++) { 2774 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2775 if (!allcolumns[i]) { PetscCall(ISRestoreIndices(iscol[i], icol + i)); } 2776 } 2777 2778 for (i = 0; i < ismax; i++) { 2779 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2780 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2781 } 2782 2783 /* Destroy allocated memory */ 2784 if (sbuf_aa) { 2785 PetscCall(PetscFree(sbuf_aa[0])); 2786 PetscCall(PetscFree(sbuf_aa)); 2787 } 2788 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2789 2790 for (i = 0; i < nrqs; ++i) { PetscCall(PetscFree(rbuf4[i])); } 2791 PetscCall(PetscFree(rbuf4)); 2792 2793 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2794 
PetscFunctionReturn(0); 2795 } 2796 2797 /* 2798 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2799 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2800 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2801 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2802 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2803 state, and needs to be "assembled" later by compressing B's column space. 2804 2805 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2806 Following this call, C->A & C->B have been created, even if empty. 2807 */ 2808 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) { 2809 /* If making this function public, change the error returned in this function away from _PLIB. */ 2810 Mat_MPIAIJ *aij; 2811 Mat_SeqAIJ *Baij; 2812 PetscBool seqaij, Bdisassembled; 2813 PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count; 2814 PetscScalar v; 2815 const PetscInt *rowindices, *colindices; 2816 2817 PetscFunctionBegin; 2818 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
*/ 2819 if (A) { 2820 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2821 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2822 if (rowemb) { 2823 PetscCall(ISGetLocalSize(rowemb, &m)); 2824 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2825 } else { 2826 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2827 } 2828 if (dcolemb) { 2829 PetscCall(ISGetLocalSize(dcolemb, &n)); 2830 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2831 } else { 2832 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2833 } 2834 } 2835 if (B) { 2836 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2837 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2838 if (rowemb) { 2839 PetscCall(ISGetLocalSize(rowemb, &m)); 2840 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2841 } else { 2842 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2843 } 2844 if (ocolemb) { 2845 PetscCall(ISGetLocalSize(ocolemb, &n)); 2846 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2847 } else { 2848 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with 
the MPIAIJ matrix"); 2849 } 2850 } 2851 2852 aij = (Mat_MPIAIJ *)C->data; 2853 if (!aij->A) { 2854 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2855 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2856 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2857 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2858 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2859 PetscCall(PetscLogObjectParent((PetscObject)C, (PetscObject)aij->A)); 2860 } 2861 if (A) { 2862 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2863 } else { 2864 PetscCall(MatSetUp(aij->A)); 2865 } 2866 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2867 /* 2868 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2869 need to "disassemble" B -- convert it to using C's global indices. 2870 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2871 2872 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2873 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2874 2875 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2876 At least avoid calling MatSetValues() and the implied searches? 2877 */ 2878 2879 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2880 #if defined(PETSC_USE_CTABLE) 2881 PetscCall(PetscTableDestroy(&aij->colmap)); 2882 #else 2883 PetscCall(PetscFree(aij->colmap)); 2884 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. 
*/ 2885 if (aij->B) PetscCall(PetscLogObjectMemory((PetscObject)C, -aij->B->cmap->n * sizeof(PetscInt))); 2886 #endif 2887 ngcol = 0; 2888 if (aij->lvec) { PetscCall(VecGetSize(aij->lvec, &ngcol)); } 2889 if (aij->garray) { 2890 PetscCall(PetscFree(aij->garray)); 2891 PetscCall(PetscLogObjectMemory((PetscObject)C, -ngcol * sizeof(PetscInt))); 2892 } 2893 PetscCall(VecDestroy(&aij->lvec)); 2894 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2895 } 2896 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { PetscCall(MatDestroy(&aij->B)); } 2897 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { PetscCall(MatZeroEntries(aij->B)); } 2898 } 2899 Bdisassembled = PETSC_FALSE; 2900 if (!aij->B) { 2901 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2902 PetscCall(PetscLogObjectParent((PetscObject)C, (PetscObject)aij->B)); 2903 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2904 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2905 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2906 Bdisassembled = PETSC_TRUE; 2907 } 2908 if (B) { 2909 Baij = (Mat_SeqAIJ *)B->data; 2910 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2911 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2912 for (i = 0; i < B->rmap->n; i++) { nz[i] = Baij->i[i + 1] - Baij->i[i]; } 2913 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2914 PetscCall(PetscFree(nz)); 2915 } 2916 2917 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2918 shift = rend - rstart; 2919 count = 0; 2920 rowindices = NULL; 2921 colindices = NULL; 2922 if (rowemb) { PetscCall(ISGetIndices(rowemb, &rowindices)); } 2923 if (ocolemb) { PetscCall(ISGetIndices(ocolemb, &colindices)); } 2924 for (i = 0; i < B->rmap->n; i++) { 2925 PetscInt row; 2926 row = i; 2927 if (rowindices) row = rowindices[i]; 2928 for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2929 col = Baij->j[count]; 2930 if (colindices) col = colindices[col]; 2931 if (Bdisassembled && col >= rstart) col += shift; 2932 v = Baij->a[count]; 2933 
PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2934 ++count; 2935 } 2936 } 2937 /* No assembly for aij->B is necessary. */ 2938 /* FIXME: set aij->B's nonzerostate correctly. */ 2939 } else { 2940 PetscCall(MatSetUp(aij->B)); 2941 } 2942 C->preallocated = PETSC_TRUE; 2943 C->was_assembled = PETSC_FALSE; 2944 C->assembled = PETSC_FALSE; 2945 /* 2946 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2947 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2948 */ 2949 PetscFunctionReturn(0); 2950 } 2951 2952 /* 2953 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2954 */ 2955 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) { 2956 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 2957 2958 PetscFunctionBegin; 2959 PetscValidPointer(A, 2); 2960 PetscValidPointer(B, 3); 2961 /* FIXME: make sure C is assembled */ 2962 *A = aij->A; 2963 *B = aij->B; 2964 /* Note that we don't incref *A and *B, so be careful! */ 2965 PetscFunctionReturn(0); 2966 } 2967 2968 /* 2969 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 2970 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat)) {
  PetscMPIInt size, flag;
  PetscInt    i, ii, cismax, ispar;
  Mat        *A, *B;
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* cismax counts this rank's parallel (comm size > 1) index sets. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
      /* Now we have to
        (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
            were sorted on each rank, concatenated they might no longer be sorted;
        (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
            indices in the nondecreasing order to the original index positions.
        If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        j, issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
      NOTE(review): ISSort() modifies the caller's isrow[i]/iscol[i] in place.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction of this parallel IS. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) { PetscCall(PetscMalloc1(ismax, submat)); }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        /* Drop this rank's reference to the diag piece; A[i] is left dangling, but only the
           array A itself is freed below (B is released via MatDestroySubMatrices). */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) { PetscCall(MatDestroy(&(*submat)[i])); }
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* Ownership of A[i] transfers to the caller's submat array. */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(0);
}

/* MPIAIJ instantiation of the generic MPIXAIJ submatrix-extraction driver above. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) {
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}