1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscHMapI *); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *); 20 21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov) 22 { 23 PetscInt i; 24 25 PetscFunctionBegin; 26 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 27 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is)); 28 PetscFunctionReturn(0); 29 } 30 31 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov) 32 { 33 PetscInt i; 34 35 PetscFunctionBegin; 36 PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 37 for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is)); 38 PetscFunctionReturn(0); 39 } 40 41 
/*
  MatIncreaseOverlap_MPIAIJ_Once_Scalable - Expand each of the nidx index sets in is[]
  by one layer of matrix connectivity, replacing each IS in place.

  Rows referenced by an IS but owned by another rank are shipped to their owner,
  expanded there against the owner's local rows, and the resulting column indices
  are shipped back.  All communication is two-sided (PetscCommBuildTwoSided +
  PetscSF), so memory use scales with the data actually exchanged, not with the
  global matrix size.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  /* reverse direction: the former senders now receive the expanded row data */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge the overlap data received for
  remote rows (recvdata, a stream of <is id, count, cols...> records) into the
  index sets that were already expanded locally, deduplicate, and rebuild each IS.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz, isz_i, i, j, is_id, data_size;
  PetscInt        col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of (max_lsize + nrecvs) entries: worst case is all received data landing in one IS */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - For each requesting rank, expand the rows
  it asked for (fromrows, as <is id, row> pairs) against this rank's diagonal (Amat)
  and off-diagonal (Bmat) blocks, and assemble per-rank reply buffers.

  Output (allocated here, ownership passes to the caller):
   sbrowsizes - 2*nfrom array of <size, offset> pairs, suitable for PetscCommBuildTwoSided
   sbrows     - concatenated reply payload: triples <is id, count, cols...>
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count the (deduplicated) reply size per <rank, is> pair */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id  = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal, now actually writing the reply payload */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each IS by one layer using only
  locally owned rows: for every index owned by this rank, add the column indices of
  that row from both the diagonal (Amat) and off-diagonal (Bmat) blocks, then
  deduplicate and rebuild the IS in place.  Indices owned by other ranks are
  skipped here (they are handled by the send/receive path).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    /* keep the IS communicator alive across the destroy/create pair */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants
processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0;
  PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j]; /* w1: total indices destined for proc j */
        w3[j]++;        /* w3: number of ISs with data for proc j */
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to intself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    /* carve the single buffer tmp into per-destination segments of length w1[j] */
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1; /* ptr[j] walks the data region past the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscHMapICreateWithSize(n[i], table_data + i));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          /* keys/values are stored shifted by one so that 0 stays an unused-slot marker */
          PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0];
          outdat_j[2 * k]     = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i = 0; i < nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1; /* data region starts after the header */
      jmax    = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];
        is_no   = rbuf2_i[2 * j - 1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapISet(table_data_i, row + 1, isz_i + 1));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscHashIter tpos;

    table_data_i = table_data[i];
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, k);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      tdata[--j] = --k; /* undo the +1 shift applied when inserting */
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscHMapIDestroy(table_data + i));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
  the work on the local processor.
  Inputs:
   C     - MAT_MPIAIJ;
   imax  - total no of index sets processed at a time;
   table - an array of char - size = m bits.

  Output:
   isz   - array containing the count of the solution elements corresponding
           to each index set;
   data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscHMapI *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI    table_data_i;
  PetscHashIter tpos;
  PetscInt      tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscHMapIGetSize(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscHashIterBegin(table_data_i, tpos);
    while (!PetscHashIterAtEnd(table_data_i, tpos)) {
      PetscHashIterGetKey(table_data_i, tpos, row);
      PetscHashIterGetVal(table_data_i, tpos, j);
      PetscHashIterNext(table_data_i, tpos);
      /* stored values/keys are shifted by one; the value is the slot index */
      tdata[--j] = --row;
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i]; /* only expand the rows present before this pass; rows added below are not re-expanded */

    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscHMapISet(table_data_i, val + 1, isz_i + 1));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
  and return the output

  Input:
   C    - the matrix
   nrqr - no of messages being processed.
   rbuf - an array of pointers to the received requests

  Output:
   xdata - array of messages to be sent back
   isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a remalloc is required we need only copy the data for that one row
  rather than all previous rows as it is now where a single large chunk of
  memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k;
  PetscInt    row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  PetscInt    val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr;
  PetscInt   *rbuf_i, kmax, rbuf_0;
  PetscBT     xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count the total number of requested rows (ct) and their summed sizes (total_sz);
     each message is laid out as [nIS, (isid,len) pairs ..., row indices ...] */
  for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j];
  }

  /* heuristic upper bound: average nonzeros per local row times number of requested rows */
  if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n;
  else max1 = 1;
  mem_estimate = 3 * ((total_sz > max1 ? total_sz : max1) + 1); /* counted in PetscInts, grown on demand below */
  if (nrqr) {
    /* all xdata[i] are carved out of this single allocation */
    PetscCall(PetscMalloc1(mem_estimate, &xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m, &xtable));
  PetscCall(PetscArrayzero(isz1, nrqr));

  ct3 = 0;
  for (i = 0; i < nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1 = 2 * rbuf_0 + 1; /* read cursor past the header of the incoming message */
    ct2 = ct1;            /* write cursor into xdata[i] (header space reserved) */
    ct3 += ct1;           /* total PetscInts consumed so far across all replies */
    for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m, xtable)); /* dedup table over all global rows, reset per IS */
      oct2 = ct2;
      kmax = rbuf_i[2 * j];
      for (k = 0; k < kmax; k++, ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable, row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase every xdata[ctr], since
               they all alias into xdata[0]'s allocation */
            new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
            PetscCall(PetscMalloc1(new_estimate, &tmp));
            PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0] = tmp;
            mem_estimate = new_estimate;
            ++no_malloc;
            for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each newly accepted row by its A and B columns (one level of overlap) */
      for (k = oct2, max2 = ct2; k < max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (l = start; l < end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0] = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row + 1];
        for (l = start; l < end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable, val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5 * mem_estimate) + 1;
              PetscCall(PetscMalloc1(new_estimate, &tmp));
              PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0] = tmp;
              mem_estimate = new_estimate;
              ++no_malloc;
              /* all xdata[ctr] alias into xdata[0]'s allocation; rebase them after the realloc */
              for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2 * j - 1] = rbuf_i[2 * j - 1];
    }
    xdata[i][0] = rbuf_0;
    if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i] = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  /* NOTE(review): mem_estimate and ct3 count PetscInt entries, not bytes, although the
     message text says "bytes" -- left as-is to preserve the logged output */
  PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc));
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *);
/*
    Every processor gets the entire matrix

    Bin is an array of length 2; only the first slot is used to return the
    gathered sequential matrix (see the PetscCalloc1(2, Bin) below).
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[])
{
  Mat          B;
  Mat_MPIAIJ  *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ  *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data;
  PetscMPIInt  size, rank, *recvcounts = NULL, *displs = NULL;
  PetscInt     sendcount, i, *rstarts = A->rmap->range, n, cnt, j;
  PetscInt     m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per
       row
    */
    PetscCall(PetscMalloc1(A->rmap->N, &lens));
    /* local row lengths = diagonal-block (ad) plus off-diagonal-block (bd) counts */
    for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart];
    PetscCall(PetscMalloc2(size, &recvcounts, size, &displs));
    for (i = 0; i < size; i++) {
      recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* in-place allgather: each rank fills in its own slice of lens[], then learns everyone else's */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF, &B));
    PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B, A, A));
    PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B, 0, lens));
    PetscCall(PetscCalloc1(2, Bin));
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into this rank's slice of B's column array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge ad (diagonal block) and bd (off-diagonal block) columns so that each
       row comes out in global column order: below-diagonal bd, then ad, then above-diagonal bd */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) jsendbuf[cnt++] = garray[*b_jsendbuf++];
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i = 0; i < size; i++) {
      recvcounts[i] = 0;
      for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j];
    }
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1];
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf;
    MatScalar         *sendbuf, *recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B, &bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* this rank's slice of B's value array */
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are merged in the same lower-bd / ad / upper-bd order used for the column indices */
    for (i = 0; i < n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i + 1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++;

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); /* MAT_REUSE_MATRIX path never allocated these */
    for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]];
    displs[0] = 0;
    for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1];
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda));
  } /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts, displs));

  PetscCall(MatPropagateSymmetryOptions(A, B));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS_Local - extract the submatrix defined by
  isrow[0]/iscol[0] onto each process as a sequential matrix. Only supports
  ismax == 1 on all processes (checked below); with allcolumns set the column
  IS is ignored and all columns of C are taken.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         submat, A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc;
  PetscInt   *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB;
  PetscInt    cstart =
C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray;
  const PetscInt *icol, *irow;
  PetscInt        nrow, ncol, start;
  PetscMPIInt     rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr;
  PetscInt      **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc;
  PetscInt        nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr;
  PetscInt      **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz;
  PetscInt       *lens, rmax, ncols, *cols, Crow;
#if defined(PETSC_USE_CTABLE)
  PetscHMapI cmap, rmap;
  PetscInt  *cmap_loc, *rmap_loc;
#else
  PetscInt *cmap, *rmap;
#endif
  PetscInt     ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i;
  PetscInt    *cworkB, lwrite, *subcols, *row2proc;
  PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3;
  MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4;
  MPI_Status  *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2;
  MPI_Status  *r_status3 = NULL, *r_status4, *s_status4;
  MPI_Comm     comm;
  PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i;
  PetscMPIInt *onodes1, *olengths1, idex, end;
  Mat_SubSppt *smatis1;
  PetscBool    isrowsorted, iscolsorted;

  PetscFunctionBegin;
  PetscValidLogicalCollectiveInt(C, ismax, 2);
  PetscValidLogicalCollectiveEnum(C, scall, 5);
  PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1");
  PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a));
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;

  PetscCall(ISSorted(iscol[0], &iscolsorted));
  PetscCall(ISSorted(isrow[0], &isrowsorted));
  PetscCall(ISGetIndices(isrow[0], &irow));
  PetscCall(ISGetLocalSize(isrow[0], &nrow));
  if (allcolumns) {
    icol = NULL;
    ncol = C->cmap->N;
  } else {
    PetscCall(ISGetIndices(iscol[0], &icol));
    PetscCall(ISGetLocalSize(iscol[0], &ncol));
  }

  /* First pass: set up the communication pattern, exchange requested rows and their
     column indices, size the submatrix, and stash all buffers for later reuse */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i, *cworkA, lwrite, ctmp;

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
    PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3));

    /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
    PetscCall(PetscCalloc2(size, &w1, size, &w2));
    PetscCall(PetscMalloc1(nrow, &row2proc));

    /* w1[proc] = num of rows owned by proc -- to be requested */
    proc = 0;
    nrqs = 0; /* num of outgoing messages */
    for (j = 0; j < nrow; j++) {
      row = irow[j];
      if (!isrowsorted) proc = 0; /* sorted rows let the owner search resume where it left off */
      while (row >= C->rmap->range[proc + 1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */

      if (proc != rank && !w2[proc]) {
        w2[proc] = 1;
        nrqs++;
      }
    }
    w1[rank] = 0; /* rows owned by self will not be requested */

    PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/
    for (proc = 0, j = 0; proc < size; proc++) {
      if (w1[proc]) pa[j++] = proc;
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
    msz = 0; /* total mesg length (for all procs) */
    for (i = 0; i < nrqs; i++) {
      proc = pa[i];
      w1[proc] += 3;
      msz += w1[proc];
    }
    PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz));

    /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
    PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));

    /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

    /* Now post the Irecvs corresponding to these messages */
    PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1));

    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Allocate Memory for outgoing messages */
    PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr));
    PetscCall(PetscArrayzero(sbuf1, size));
    PetscCall(PetscArrayzero(ptr, size));

    /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i = 0; i < nrqs; i++) {
      proc        = pa[i];
      sbuf1[proc] = iptr;
      iptr += w1[proc];
    }

    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i = 0; i < nrqs; i++) {
      proc = pa[i];
      PetscCall(PetscArrayzero(sbuf1[proc], 3));
      ptr[proc] = sbuf1[proc] + 3;
    }

    /* Parse the isrow and copy data into outbuf */
    PetscCall(PetscArrayzero(ctr, size));
    for (j = 0; j < nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf*/
        *ptr[proc] = irow[j];
        ctr[proc]++;
        ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j = 0; j < size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j            = sbuf1[j];
        k                  = ++sbuf1_j[0];
        sbuf1_j[2 * k]     = ctr_j;
        sbuf1_j[2 * k - 1] = 0;
      }
    }

    /* Now post the sends */
    PetscCall(PetscMalloc1(nrqs, &s_waits1));
    for (i = 0; i < nrqs; ++i) {
      proc = pa[i];
      PetscCallMPI(MPI_Isend(sbuf1[proc], w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i));
    }

    /* Post Receives to capture the buffer size */
    PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2));
    PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3));

    /* rbuf2 slices reuse the tail of the tmp allocation (after the msz send area) */
    if (nrqs) rbuf2[0] = tmp + msz;
    for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]];

    for (i = 0; i < nrqs; ++i) {
      proc = pa[i];
      PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i));
    }

    PetscCall(PetscFree2(w1, w2));

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    PetscCall(PetscMalloc1(nrqr, &r_status1));
    PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1));

    PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1));
    for (i = 0; i < nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i     = rbuf1[i];
      start       = 2 * rbuf1_i[0] + 1;
      PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end));
      PetscCall(PetscMalloc1(end, &sbuf2[i]));
      sbuf2_i = sbuf2[i];
      /* per requested row, reply with its total nonzero count (A part + B part) */
      for (j = start; j < end; j++) {
        k           = rbuf1_i[j] - rstart;
        ncols       = ai[k + 1] - ai[k] + bi[k + 1] - bi[k];
        sbuf2_i[j]  = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j];

      PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i));
    }

    PetscCall(PetscFree(r_status1));
    PetscCall(PetscFree(r_waits1));

    /* rbuf2 is received, Post recv column indices a->j */
    PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2));

    PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3));
    for (i = 0; i < nrqs; ++i) {
      PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i]));
      req_source2[i] = r_status2[i].MPI_SOURCE;
      PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i));
    }

    /* Wait on sends1 and sends2 */
    PetscCall(PetscMalloc1(nrqs, &s_status1));
    PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1));
    PetscCall(PetscFree(s_waits1));
    PetscCall(PetscFree(s_status1));

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2));
    PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2));

    /* Now allocate sending buffers for a->j, and send them off */
    PetscCall(PetscMalloc1(nrqr, &sbuf_aj));
    for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
    if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); /* single allocation; sbuf_aj[i] alias into it */
    for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1];

    for (i = 0; i < nrqr; i++) { /* for each requested message */
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2 * rbuf1_i[0] + 1;
      ct2       = 0;
      /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k = 0; k < kmax; k++, ct1++) { /* for each row */
        row   = rbuf1_i[ct1] - rstart;
        nzA   = ai[row + 1] - ai[row];
        nzB   = bi[row + 1] - bi[row];
        ncols = nzA + nzB;
        cworkA = aj + ai[row];
        cworkB = bj + bi[row];

        /* load the column indices for this row into cols*/
        cols = sbuf_aj_i + ct2;

        /* global column order: B columns below cstart, then A columns, then B columns >= cend */
        lwrite = 0;
        for (l = 0; l < nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l = 0; l < nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i));
    }

    /* create column map (cmap): global col of C -> local col of submat */
#if defined(PETSC_USE_CTABLE)
    if (!allcolumns) {
      PetscCall(PetscHMapICreateWithSize(ncol, &cmap));
      PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc));
      for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */
        if (icol[j] >= cstart && icol[j] < cend) {
          cmap_loc[icol[j] - cstart] = j + 1; /* +1 shift so that 0 means "not selected" */
        } else { /* use PetscHMapI for non-local col indices */
          PetscCall(PetscHMapISet(cmap, icol[j] + 1, j + 1));
        }
      }
    } else {
      cmap     = NULL;
      cmap_loc = NULL;
    }
    PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc));
#else
    if (!allcolumns) {
      PetscCall(PetscCalloc1(C->cmap->N, &cmap));
      for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1;
    } else {
      cmap = NULL;
    }
#endif

    /* Create lens for MatSeqAIJSetPreallocation() */
    PetscCall(PetscCalloc1(nrow, &lens));

    /* Compute lens from local part of C */
    for (j = 0; j < nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) {
        /* diagonal part A = c->A */
        ncols = ai[row - rstart + 1] - ai[row - rstart];
        cols  = aj + ai[row - rstart];
        if (!allcolumns) {
          for (k = 0; k < ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            tcol = cmap_loc[cols[k]];
#else
            tcol = cmap[cols[k] + cstart];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] = ncols;
        }

        /* off-diagonal part B = c->B */
        ncols = bi[row - rstart + 1] - bi[row - rstart];
        cols  = bj + bi[row - rstart];
        if (!allcolumns) {
          for (k = 0; k < ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
#else
            tcol = cmap[bmap[cols[k]]];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] += ncols;
        }
      }
    }

    /* Create row map (rmap): global row of C -> local row of submat */
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscHMapICreateWithSize(nrow, &rmap));
    for (j = 0; j < nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) { /* a local row */
        rmap_loc[row - rstart] = j;
      } else {
        PetscCall(PetscHMapISet(rmap, irow[j] + 1, j + 1));
      }
    }
#else
    PetscCall(PetscCalloc1(C->rmap->N, &rmap));
    for (j = 0; j < nrow; j++) rmap[irow[j]] = j;
#endif

    /* Update lens from offproc data */
    /* recv a->j is done */
    PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3));
    for (i = 0; i < nrqs; i++) {
      proc    = pa[i];
      sbuf1_i = sbuf1[proc];
      /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
      ct1     = 2 + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i]; /* received length of C->j */
      rbuf3_i = rbuf3[i]; /* received C->j */

      /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
      max1 = sbuf1_i[2];
      for (k = 0; k < max1; k++, ct1++) {
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscHMapIGetWithDefault(rmap, sbuf1_i[ct1] + 1, 0, &row));
        row--;
        PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table");
#else
        row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
#endif
        /* Now, store row index of submat in sbuf1_i[ct1] */
        sbuf1_i[ct1] = row;

        nnz = rbuf2_i[ct1];
        if (!allcolumns) {
          for (l = 0; l < nnz; l++, ct2++) {
#if defined(PETSC_USE_CTABLE)
            if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
              tcol = cmap_loc[rbuf3_i[ct2] - cstart];
            } else {
              PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
            }
#else
            tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
#endif
            if (tcol) lens[row]++;
          }
        } else { /* allcolumns */
          lens[row] += nnz;
        }
      }
    }
    PetscCallMPI(MPI_Waitall(nrqr, s_waits3, s_status3));
    PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3));

    /* Create the submatrices */
    PetscCall(MatCreate(PETSC_COMM_SELF, &submat));
    PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE));

    PetscCall(ISGetBlockSize(isrow[0], &i));
    PetscCall(ISGetBlockSize(iscol[0], &j));
    PetscCall(MatSetBlockSizes(submat, i, j));
    PetscCall(MatSetType(submat, ((PetscObject)A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens));

    /* create struct Mat_SubSppt and attached it to submat; it stashes the
       communication buffers so a MAT_REUSE_MATRIX call can skip the setup */
    PetscCall(PetscNew(&smatis1));
    subc            = (Mat_SeqAIJ *)submat->data;
    subc->submatis1 = smatis1;

    smatis1->id          = 0;
    smatis1->nrqs        = nrqs;
    smatis1->nrqr        = nrqr;
    smatis1->rbuf1       = rbuf1;
    smatis1->rbuf2       = rbuf2;
    smatis1->rbuf3       = rbuf3;
    smatis1->sbuf2       = sbuf2;
    smatis1->req_source2 = req_source2;

    smatis1->sbuf1 = sbuf1;
    smatis1->ptr   = ptr;
    smatis1->tmp   = tmp;
    smatis1->ctr   = ctr;

    smatis1->pa          = pa;
    smatis1->req_size    = req_size;
    smatis1->req_source1 = req_source1;

    smatis1->allcolumns = allcolumns;
    smatis1->singleis   = PETSC_TRUE;
    smatis1->row2proc   = row2proc;
    smatis1->rmap       = rmap;
    smatis1->cmap       = cmap;
#if defined(PETSC_USE_CTABLE)
    smatis1->rmap_loc = rmap_loc;
    smatis1->cmap_loc = cmap_loc;
#endif

    smatis1->destroy     = submat->ops->destroy;
    submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
    submat->factortype   = C->factortype;

    /* compute rmax */
    rmax = 0;
    for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]);

  } else { /* scall == MAT_REUSE_MATRIX */
    submat = submats[0];
    PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong size");

    /* restore the communication state saved by the MAT_INITIAL_MATRIX call */
    subc        = (Mat_SeqAIJ *)submat->data;
    rmax        = subc->rmax;
    smatis1     = subc->submatis1;
    nrqs        = smatis1->nrqs;
    nrqr        = smatis1->nrqr;
    rbuf1       = smatis1->rbuf1;
    rbuf2       = smatis1->rbuf2;
    rbuf3       = smatis1->rbuf3;
    req_source2 = smatis1->req_source2;

    sbuf1 = smatis1->sbuf1;
    sbuf2 = smatis1->sbuf2;
    ptr   = smatis1->ptr;
    tmp   = smatis1->tmp;
    ctr   = smatis1->ctr;

    pa          = smatis1->pa;
    req_size    = smatis1->req_size;
    req_source1 = smatis1->req_source1;

    allcolumns = smatis1->allcolumns;
    row2proc   = smatis1->row2proc;
    rmap       = smatis1->rmap;
    cmap       = smatis1->cmap;
#if defined(PETSC_USE_CTABLE)
    rmap_loc = smatis1->rmap_loc;
    cmap_loc = smatis1->cmap_loc;
#endif
  }

  /* Post recv matrix values */
  PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals));
  PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4));
  for (i = 0; i < nrqs; ++i) {
    PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i]));
    PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i));
  }

  /* Allocate sending buffers for a->a, and send them off */
  PetscCall(PetscMalloc1(nrqr, &sbuf_aa));
  for (i = 0, j = 0; i < nrqr; i++) j += req_size[i];
  if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); /* single allocation; sbuf_aa[i] alias into it */
  for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1];

  for (i = 0; i < nrqr; i++) {
    rbuf1_i   = rbuf1[i];
    sbuf_aa_i = sbuf_aa[i];
    ct1       = 2 * rbuf1_i[0] + 1;
    ct2       = 0;
    /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

    kmax = rbuf1_i[2];
    for (k = 0; k < kmax; k++, ct1++) {
      row   = rbuf1_i[ct1] - rstart;
      nzA   = ai[row + 1] - ai[row];
      nzB   = bi[row + 1] - bi[row];
      ncols = nzA + nzB;
      cworkB = bj + bi[row];
      vworkA = a_a + ai[row];
      vworkB = b_a + bi[row];

      /* load the column values for this row into vals*/
      vals = sbuf_aa_i + ct2;

      /* same lower-B / A / upper-B ordering used for the column indices above */
      lwrite = 0;
      for (l = 0; l < nzB; l++) {
        if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
      }
      for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l];
      for (l = 0; l < nzB; l++) {
        if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
      }

      ct2 += ncols;
    }
    PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i));
  }

  /* Assemble submat */
  /* First assemble the local rows */
  for (j = 0; j < nrow; j++) {
    row  = irow[j];
    proc = row2proc[j];
    if (proc == rank) {
      Crow = row - rstart; /* local row index of C */
#if defined(PETSC_USE_CTABLE)
      row = rmap_loc[Crow]; /* row index of submat */
#else
      row = rmap[row];
#endif

      if (allcolumns) {
        /* diagonal part A = c->A */
        ncols = ai[Crow + 1] - ai[Crow];
        cols  = aj + ai[Crow];
        vals  = a_a + ai[Crow];
        i     = 0; /* note: i is reused here as the running entry count for this row */
        for (k = 0; k < ncols; k++) {
          subcols[i]   = cols[k] + cstart;
          subvals[i++] = vals[k];
        }

        /* off-diagonal part B = c->B */
        ncols = bi[Crow + 1] - bi[Crow];
        cols  = bj + bi[Crow];
        vals  = b_a + bi[Crow];
        for (k = 0; k < ncols; k++) {
          subcols[i]   = bmap[cols[k]];
          subvals[i++] = vals[k];
        }

        PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));

      } else { /* !allcolumns */
#if defined(PETSC_USE_CTABLE)
        /* diagonal part A = c->A */
        ncols = ai[Crow + 1] - ai[Crow];
        cols  = aj + ai[Crow];
        vals  = a_a + ai[Crow];
        i     = 0;
        for (k = 0; k < ncols; k++) {
          tcol = cmap_loc[cols[k]];
          if (tcol) {
            subcols[i]   = --tcol; /* undo the +1 shift stored in the map */
            subvals[i++] = vals[k];
          }
        }

        /* off-diagonal part B = c->B */
        ncols = bi[Crow + 1] - bi[Crow];
        cols  = bj + bi[Crow];
        vals  = b_a + bi[Crow];
        for (k = 0; k < ncols; k++) {
          PetscCall(PetscHMapIGetWithDefault(cmap, bmap[cols[k]] + 1, 0, &tcol));
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }
#else
        /* diagonal part A = c->A */
        ncols = ai[Crow + 1] - ai[Crow];
        cols  = aj + ai[Crow];
        vals  = a_a + ai[Crow];
        i     = 0;
        for (k = 0; k < ncols; k++) {
          tcol = cmap[cols[k] + cstart];
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }

        /* off-diagonal part B = c->B */
        ncols = bi[Crow + 1] - bi[Crow];
        cols  = bj + bi[Crow];
        vals  = b_a + bi[Crow];
        for (k = 0; k < ncols; k++) {
          tcol = cmap[bmap[cols[k]]];
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }
#endif
        PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES));
      }
    }
  }

  /* Now assemble the off-proc rows */
  for (i = 0; i < nrqs; i++) { /* for each requested message */
    /* recv values from other processes */
    PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i));
    proc    = pa[idex];
    sbuf1_i = sbuf1[proc];
    /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
    ct1     = 2 + 1;
    ct2     = 0; /* count of received C->j */
    ct3     = 0; /* count of received C->j that will be inserted into submat */
    rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */
    rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */
    rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */

    /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
    max1 = sbuf1_i[2]; /* num of rows */
    for (k = 0; k < max1; k++, ct1++) { /* for each recved row */
      row = sbuf1_i[ct1]; /* row index of submat */
      if (!allcolumns) {
        idex = 0; /* idex is reused here as the insertion counter for this row */
        if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
          nnz = rbuf2_i[ct1]; /* num of C entries in this row */
          for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */
#if defined(PETSC_USE_CTABLE)
            if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
              tcol = cmap_loc[rbuf3_i[ct2] - cstart];
            } else {
              PetscCall(PetscHMapIGetWithDefault(cmap, rbuf3_i[ct2] + 1, 0, &tcol));
            }
#else
            tcol = cmap[rbuf3_i[ct2]];
#endif
            if (tcol) {
              subcols[idex]   = --tcol; /* may not be sorted */
              subvals[idex++] = rbuf4_i[ct2];

              /* We receive an entire column of C, but a subset of it needs to be inserted into submat.
                 For reuse, we replace received C->j with index that should be inserted to submat */
              if (iscolsorted) rbuf3_i[ct3++] = ct2;
            }
          }
          PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES));
        } else { /* scall == MAT_REUSE_MATRIX */
          submat = submats[0];
          subc   = (Mat_SeqAIJ *)submat->data;

          nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */
          for (l = 0; l < nnz; l++) {
            ct2             = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
            subvals[idex++] = rbuf4_i[ct2];
          }

          bj = subc->j + subc->i[row]; /* sorted column indices */
          PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES));
        }
      } else { /* allcolumns */
        nnz = rbuf2_i[ct1]; /* num of C entries in this row */
        PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES));
        ct2 += nnz;
      }
    }
  }

  /* sending a->a are done */
  PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4));
  PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4));

  PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY));
  /* NOTE(review): tail of a routine whose definition begins above this excerpt
     (apparently the single-IS local extraction called below): finish assembly of
     the one submatrix, return it, and release all communication buffers. */
  PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY));
  submats[0] = submat;

  /* Restore the indices */
  PetscCall(ISRestoreIndices(isrow[0], &irow));
  if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol));

  /* Destroy allocated memory: one rbuf4[i] value buffer per received message */
  for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i]));
  PetscCall(PetscFree3(rbuf4, subcols, subvals));
  if (sbuf_aa) {
    /* sbuf_aa[0] is the single contiguous send buffer; the other entries point into it */
    PetscCall(PetscFree(sbuf_aa[0]));
    PetscCall(PetscFree(sbuf_aa));
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscFree(lens));
    if (sbuf_aj) {
      /* same layout as sbuf_aa: one contiguous column-index buffer plus a pointer array */
      PetscCall(PetscFree(sbuf_aj[0]));
      PetscCall(PetscFree(sbuf_aj));
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS - submatrix extraction for the case where
  each process contributes a single index set (fast path used by PCSetUp_ASM(),
  see the submat_singleis test in MatCreateSubMatrices_MPIAIJ below).

  Input Parameters:
+ C     - the parallel MPIAIJ matrix
. ismax - number of index sets passed through to the local routine
. isrow - row index sets; only isrow[0] is used by the driver below
. iscol - column index sets; only iscol[0] is inspected here
- scall - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

  Output Parameter:
. submat - array of sequential submatrices; on MAT_INITIAL_MATRIX two slots are
           allocated (presumably one for the submatrix and one spare for reuse
           bookkeeping -- TODO confirm against the _Local routine)
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt  ncol;
  PetscBool colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat));

  /* Check for special case: each processor gets entire matrix columns,
     i.e. iscol[0] is the identity of full global length */
  PetscCall(ISIdentity(iscol[0], &colflag));
  PetscCall(ISGetLocalSize(iscol[0], &ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ - driver for extracting sequential submatrices from
  a parallel MPIAIJ matrix.

  The work is split into stages so that each stage handles at most nmax index
  sets (bounded by the matrix column dimension, see below); all ranks must agree
  on the global stage count, hence the MPIU_Allreduce on the initial call.

  Input Parameters:
+ C     - the parallel MPIAIJ matrix
. ismax - local number of index-set pairs
. isrow, iscol - the row/column index sets
- scall - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

  Output Parameter:
. submat - array of ismax sequential submatrices (plus dummy matrices, one per
           stage in which this rank has no work, used to carry reuse data)
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscInt     nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2];
  PetscBool    rowflag, colflag, wantallmatrix = PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* nmax = number of index sets per stage, chosen so one stage touches roughly
     20M/sizeof(PetscInt) column entries; a matrix with no columns gets the cap */
  if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt);
  else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1; /* always make progress even for very wide matrices */

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow, &rowflag));
      PetscCall(ISIdentity(*iscol, &colflag));
      PetscCall(ISGetLocalSize(*isrow, &nrow));
      PetscCall(ISGetLocalSize(*iscol, &ncol));
      /* whole (square) matrix requested: identity ISes of full global size */
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax / nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* One MPI_MAX reduction carries both values: negating wantallmatrix makes
       max(-w_i) = -min(w_i), i.e. a logical AND across ranks, while nstages
       simply takes the global maximum */
    in[0] = -1 * (PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ *)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt *)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* replay the stage count saved on the initial call */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat));

  for (i = 0, pos = 0; i < nstages; i++) {
    /* max_no = number of index sets handled this stage; 0 means this rank only
       participates in the collective calls and produces a dummy matrix */
    if (pos + nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax - pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt *)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ *)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
PetscFunctionReturn(0); 2016 } 2017 2018 /* -------------------------------------------------------------------------*/ 2019 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) 2020 { 2021 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 2022 Mat A = c->A; 2023 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc; 2024 const PetscInt **icol, **irow; 2025 PetscInt *nrow, *ncol, start; 2026 PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr; 2027 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1; 2028 PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol; 2029 PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2030 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2031 #if defined(PETSC_USE_CTABLE) 2032 PetscHMapI *cmap, cmap_i = NULL, *rmap, rmap_i; 2033 #else 2034 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2035 #endif 2036 const PetscInt *irow_i; 2037 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2038 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2039 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2040 MPI_Comm comm; 2041 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2042 PetscMPIInt *onodes1, *olengths1, end; 2043 PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2044 Mat_SubSppt *smat_i; 2045 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2046 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen; 2047 2048 PetscFunctionBegin; 2049 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2050 size = c->size; 2051 rank = c->rank; 2052 2053 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2054 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt 
***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2055 2056 for (i = 0; i < ismax; i++) { 2057 PetscCall(ISSorted(iscol[i], &issorted[i])); 2058 if (!issorted[i]) iscsorted = issorted[i]; 2059 2060 PetscCall(ISSorted(isrow[i], &issorted[i])); 2061 2062 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2063 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2064 2065 /* Check for special case: allcolumn */ 2066 PetscCall(ISIdentity(iscol[i], &colflag)); 2067 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2068 if (colflag && ncol[i] == C->cmap->N) { 2069 allcolumns[i] = PETSC_TRUE; 2070 icol[i] = NULL; 2071 } else { 2072 allcolumns[i] = PETSC_FALSE; 2073 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2074 } 2075 } 2076 2077 if (scall == MAT_REUSE_MATRIX) { 2078 /* Assumes new rows are same length as the old rows */ 2079 for (i = 0; i < ismax; i++) { 2080 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2081 subc = (Mat_SeqAIJ *)submats[i]->data; 2082 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 2083 2084 /* Initial matrix as if empty */ 2085 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2086 2087 smat_i = subc->submatis1; 2088 2089 nrqs = smat_i->nrqs; 2090 nrqr = smat_i->nrqr; 2091 rbuf1 = smat_i->rbuf1; 2092 rbuf2 = smat_i->rbuf2; 2093 rbuf3 = smat_i->rbuf3; 2094 req_source2 = smat_i->req_source2; 2095 2096 sbuf1 = smat_i->sbuf1; 2097 sbuf2 = smat_i->sbuf2; 2098 ptr = smat_i->ptr; 2099 tmp = smat_i->tmp; 2100 ctr = smat_i->ctr; 2101 2102 pa = smat_i->pa; 2103 req_size = smat_i->req_size; 2104 req_source1 = smat_i->req_source1; 2105 2106 allcolumns[i] = smat_i->allcolumns; 2107 row2proc[i] = smat_i->row2proc; 2108 rmap[i] = smat_i->rmap; 2109 cmap[i] = smat_i->cmap; 2110 } 2111 2112 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2113 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2114 smat_i = (Mat_SubSppt *)submats[0]->data; 2115 2116 nrqs = smat_i->nrqs; 2117 nrqr = smat_i->nrqr; 2118 rbuf1 = smat_i->rbuf1; 2119 rbuf2 = smat_i->rbuf2; 2120 rbuf3 = smat_i->rbuf3; 2121 req_source2 = smat_i->req_source2; 2122 2123 sbuf1 = smat_i->sbuf1; 2124 sbuf2 = smat_i->sbuf2; 2125 ptr = smat_i->ptr; 2126 tmp = smat_i->tmp; 2127 ctr = smat_i->ctr; 2128 2129 pa = smat_i->pa; 2130 req_size = smat_i->req_size; 2131 req_source1 = smat_i->req_source1; 2132 2133 allcolumns[0] = PETSC_FALSE; 2134 } 2135 } else { /* scall == MAT_INITIAL_MATRIX */ 2136 /* Get some new tags to keep the communication clean */ 2137 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2138 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2139 2140 /* evaluate communication - mesg to who, length of mesg, and buffer space 2141 required. 
Based on this, buffers are allocated, and data copied into them*/ 2142 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2143 2144 for (i = 0; i < ismax; i++) { 2145 jmax = nrow[i]; 2146 irow_i = irow[i]; 2147 2148 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2149 row2proc[i] = row2proc_i; 2150 2151 if (issorted[i]) proc = 0; 2152 for (j = 0; j < jmax; j++) { 2153 if (!issorted[i]) proc = 0; 2154 row = irow_i[j]; 2155 while (row >= C->rmap->range[proc + 1]) proc++; 2156 w4[proc]++; 2157 row2proc_i[j] = proc; /* map row index to proc */ 2158 } 2159 for (j = 0; j < size; j++) { 2160 if (w4[j]) { 2161 w1[j] += w4[j]; 2162 w3[j]++; 2163 w4[j] = 0; 2164 } 2165 } 2166 } 2167 2168 nrqs = 0; /* no of outgoing messages */ 2169 msz = 0; /* total mesg length (for all procs) */ 2170 w1[rank] = 0; /* no mesg sent to self */ 2171 w3[rank] = 0; 2172 for (i = 0; i < size; i++) { 2173 if (w1[i]) { 2174 w2[i] = 1; 2175 nrqs++; 2176 } /* there exists a message to proc i */ 2177 } 2178 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2179 for (i = 0, j = 0; i < size; i++) { 2180 if (w1[i]) { 2181 pa[j] = i; 2182 j++; 2183 } 2184 } 2185 2186 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2187 for (i = 0; i < nrqs; i++) { 2188 j = pa[i]; 2189 w1[j] += w2[j] + 2 * w3[j]; 2190 msz += w1[j]; 2191 } 2192 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2193 2194 /* Determine the number of messages to expect, their lengths, from from-ids */ 2195 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 2196 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2197 2198 /* Now post the Irecvs corresponding to these messages */ 2199 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2200 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2201 2202 /* Allocate Memory 
for outgoing messages */ 2203 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2204 PetscCall(PetscArrayzero(sbuf1, size)); 2205 PetscCall(PetscArrayzero(ptr, size)); 2206 2207 { 2208 PetscInt *iptr = tmp; 2209 k = 0; 2210 for (i = 0; i < nrqs; i++) { 2211 j = pa[i]; 2212 iptr += k; 2213 sbuf1[j] = iptr; 2214 k = w1[j]; 2215 } 2216 } 2217 2218 /* Form the outgoing messages. Initialize the header space */ 2219 for (i = 0; i < nrqs; i++) { 2220 j = pa[i]; 2221 sbuf1[j][0] = 0; 2222 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2223 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2224 } 2225 2226 /* Parse the isrow and copy data into outbuf */ 2227 for (i = 0; i < ismax; i++) { 2228 row2proc_i = row2proc[i]; 2229 PetscCall(PetscArrayzero(ctr, size)); 2230 irow_i = irow[i]; 2231 jmax = nrow[i]; 2232 for (j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2233 proc = row2proc_i[j]; 2234 if (proc != rank) { /* copy to the outgoing buf*/ 2235 ctr[proc]++; 2236 *ptr[proc] = irow_i[j]; 2237 ptr[proc]++; 2238 } 2239 } 2240 /* Update the headers for the current IS */ 2241 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 2242 if ((ctr_j = ctr[j])) { 2243 sbuf1_j = sbuf1[j]; 2244 k = ++sbuf1_j[0]; 2245 sbuf1_j[2 * k] = ctr_j; 2246 sbuf1_j[2 * k - 1] = i; 2247 } 2248 } 2249 } 2250 2251 /* Now post the sends */ 2252 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2253 for (i = 0; i < nrqs; ++i) { 2254 j = pa[i]; 2255 PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2256 } 2257 2258 /* Post Receives to capture the buffer size */ 2259 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2260 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2261 if (nrqs) rbuf2[0] = tmp + msz; 2262 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2263 for (i = 0; i < nrqs; ++i) { 2264 j = pa[i]; 2265 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2266 } 2267 2268 /* 
Send to other procs the buf size they should allocate */ 2269 /* Receive messages*/ 2270 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2271 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2272 { 2273 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2274 PetscInt *sbuf2_i; 2275 2276 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2277 for (i = 0; i < nrqr; ++i) { 2278 req_size[i] = 0; 2279 rbuf1_i = rbuf1[i]; 2280 start = 2 * rbuf1_i[0] + 1; 2281 end = olengths1[i]; 2282 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2283 sbuf2_i = sbuf2[i]; 2284 for (j = start; j < end; j++) { 2285 id = rbuf1_i[j] - rstart; 2286 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2287 sbuf2_i[j] = ncols; 2288 req_size[i] += ncols; 2289 } 2290 req_source1[i] = onodes1[i]; 2291 /* form the header */ 2292 sbuf2_i[0] = req_size[i]; 2293 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2294 2295 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2296 } 2297 } 2298 2299 PetscCall(PetscFree(onodes1)); 2300 PetscCall(PetscFree(olengths1)); 2301 PetscCall(PetscFree(r_waits1)); 2302 PetscCall(PetscFree4(w1, w2, w3, w4)); 2303 2304 /* Receive messages*/ 2305 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2306 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2307 for (i = 0; i < nrqs; ++i) { 2308 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2309 req_source2[i] = pa[i]; 2310 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2311 } 2312 PetscCall(PetscFree(r_waits2)); 2313 2314 /* Wait on sends1 and sends2 */ 2315 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2316 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2317 PetscCall(PetscFree(s_waits1)); 2318 PetscCall(PetscFree(s_waits2)); 2319 2320 /* Now allocate sending buffers for a->j, and send them off */ 2321 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 
2322 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2323 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 2324 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2325 2326 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2327 { 2328 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2329 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2330 PetscInt cend = C->cmap->rend; 2331 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2332 2333 for (i = 0; i < nrqr; i++) { 2334 rbuf1_i = rbuf1[i]; 2335 sbuf_aj_i = sbuf_aj[i]; 2336 ct1 = 2 * rbuf1_i[0] + 1; 2337 ct2 = 0; 2338 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2339 kmax = rbuf1[i][2 * j]; 2340 for (k = 0; k < kmax; k++, ct1++) { 2341 row = rbuf1_i[ct1] - rstart; 2342 nzA = a_i[row + 1] - a_i[row]; 2343 nzB = b_i[row + 1] - b_i[row]; 2344 ncols = nzA + nzB; 2345 cworkA = a_j + a_i[row]; 2346 cworkB = b_j + b_i[row]; 2347 2348 /* load the column indices for this row into cols */ 2349 cols = sbuf_aj_i + ct2; 2350 2351 lwrite = 0; 2352 for (l = 0; l < nzB; l++) { 2353 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2354 } 2355 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2356 for (l = 0; l < nzB; l++) { 2357 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2358 } 2359 2360 ct2 += ncols; 2361 } 2362 } 2363 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2364 } 2365 } 2366 2367 /* create col map: global col of C -> local col of submatrices */ 2368 { 2369 const PetscInt *icol_i; 2370 #if defined(PETSC_USE_CTABLE) 2371 for (i = 0; i < ismax; i++) { 2372 if (!allcolumns[i]) { 2373 PetscCall(PetscHMapICreateWithSize(ncol[i], cmap + i)); 2374 2375 jmax = ncol[i]; 2376 icol_i = icol[i]; 2377 cmap_i = cmap[i]; 2378 for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(cmap[i], icol_i[j] + 1, j + 1)); 2379 } else cmap[i] = NULL; 2380 } 2381 #else 2382 for (i = 0; i < 
ismax; i++) { 2383 if (!allcolumns[i]) { 2384 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2385 jmax = ncol[i]; 2386 icol_i = icol[i]; 2387 cmap_i = cmap[i]; 2388 for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2389 } else cmap[i] = NULL; 2390 } 2391 #endif 2392 } 2393 2394 /* Create lens which is required for MatCreate... */ 2395 for (i = 0, j = 0; i < ismax; i++) j += nrow[i]; 2396 PetscCall(PetscMalloc1(ismax, &lens)); 2397 2398 if (ismax) PetscCall(PetscCalloc1(j, &lens[0])); 2399 for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1]; 2400 2401 /* Update lens from local data */ 2402 for (i = 0; i < ismax; i++) { 2403 row2proc_i = row2proc[i]; 2404 jmax = nrow[i]; 2405 if (!allcolumns[i]) cmap_i = cmap[i]; 2406 irow_i = irow[i]; 2407 lens_i = lens[i]; 2408 for (j = 0; j < jmax; j++) { 2409 row = irow_i[j]; 2410 proc = row2proc_i[j]; 2411 if (proc == rank) { 2412 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2413 if (!allcolumns[i]) { 2414 for (k = 0; k < ncols; k++) { 2415 #if defined(PETSC_USE_CTABLE) 2416 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2417 #else 2418 tcol = cmap_i[cols[k]]; 2419 #endif 2420 if (tcol) lens_i[j]++; 2421 } 2422 } else { /* allcolumns */ 2423 lens_i[j] = ncols; 2424 } 2425 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2426 } 2427 } 2428 } 2429 2430 /* Create row map: global row of C -> local row of submatrices */ 2431 #if defined(PETSC_USE_CTABLE) 2432 for (i = 0; i < ismax; i++) { 2433 PetscCall(PetscHMapICreateWithSize(nrow[i], rmap + i)); 2434 irow_i = irow[i]; 2435 jmax = nrow[i]; 2436 for (j = 0; j < jmax; j++) PetscCall(PetscHMapISet(rmap[i], irow_i[j] + 1, j + 1)); 2437 } 2438 #else 2439 for (i = 0; i < ismax; i++) { 2440 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2441 rmap_i = rmap[i]; 2442 irow_i = irow[i]; 2443 jmax = nrow[i]; 2444 for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2445 } 2446 #endif 2447 2448 /* Update lens from offproc data */ 2449 { 
2450 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2451 2452 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2453 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2454 sbuf1_i = sbuf1[pa[tmp2]]; 2455 jmax = sbuf1_i[0]; 2456 ct1 = 2 * jmax + 1; 2457 ct2 = 0; 2458 rbuf2_i = rbuf2[tmp2]; 2459 rbuf3_i = rbuf3[tmp2]; 2460 for (j = 1; j <= jmax; j++) { 2461 is_no = sbuf1_i[2 * j - 1]; 2462 max1 = sbuf1_i[2 * j]; 2463 lens_i = lens[is_no]; 2464 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2465 rmap_i = rmap[is_no]; 2466 for (k = 0; k < max1; k++, ct1++) { 2467 #if defined(PETSC_USE_CTABLE) 2468 PetscCall(PetscHMapIGetWithDefault(rmap_i, sbuf1_i[ct1] + 1, 0, &row)); 2469 row--; 2470 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2471 #else 2472 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2473 #endif 2474 max2 = rbuf2_i[ct1]; 2475 for (l = 0; l < max2; l++, ct2++) { 2476 if (!allcolumns[is_no]) { 2477 #if defined(PETSC_USE_CTABLE) 2478 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2479 #else 2480 tcol = cmap_i[rbuf3_i[ct2]]; 2481 #endif 2482 if (tcol) lens_i[row]++; 2483 } else { /* allcolumns */ 2484 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2485 } 2486 } 2487 } 2488 } 2489 } 2490 } 2491 PetscCall(PetscFree(r_waits3)); 2492 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2493 PetscCall(PetscFree(s_waits3)); 2494 2495 /* Create the submatrices */ 2496 for (i = 0; i < ismax; i++) { 2497 PetscInt rbs, cbs; 2498 2499 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2500 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2501 2502 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2503 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2504 2505 PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2506 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2507 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2508 2509 /* create struct Mat_SubSppt and attached it to submat */ 2510 PetscCall(PetscNew(&smat_i)); 2511 subc = (Mat_SeqAIJ *)submats[i]->data; 2512 subc->submatis1 = smat_i; 2513 2514 smat_i->destroy = submats[i]->ops->destroy; 2515 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2516 submats[i]->factortype = C->factortype; 2517 2518 smat_i->id = i; 2519 smat_i->nrqs = nrqs; 2520 smat_i->nrqr = nrqr; 2521 smat_i->rbuf1 = rbuf1; 2522 smat_i->rbuf2 = rbuf2; 2523 smat_i->rbuf3 = rbuf3; 2524 smat_i->sbuf2 = sbuf2; 2525 smat_i->req_source2 = req_source2; 2526 2527 smat_i->sbuf1 = sbuf1; 2528 smat_i->ptr = ptr; 2529 smat_i->tmp = tmp; 2530 smat_i->ctr = ctr; 2531 2532 smat_i->pa = pa; 2533 smat_i->req_size = req_size; 2534 smat_i->req_source1 = req_source1; 2535 2536 smat_i->allcolumns = allcolumns[i]; 2537 smat_i->singleis = PETSC_FALSE; 2538 smat_i->row2proc = row2proc[i]; 2539 smat_i->rmap = rmap[i]; 2540 smat_i->cmap = cmap[i]; 2541 } 2542 2543 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2544 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2545 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2546 PetscCall(MatSetType(submats[0], MATDUMMY)); 2547 2548 /* create struct Mat_SubSppt 
and attached it to submat */ 2549 PetscCall(PetscNew(&smat_i)); 2550 submats[0]->data = (void *)smat_i; 2551 2552 smat_i->destroy = submats[0]->ops->destroy; 2553 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2554 submats[0]->factortype = C->factortype; 2555 2556 smat_i->id = 0; 2557 smat_i->nrqs = nrqs; 2558 smat_i->nrqr = nrqr; 2559 smat_i->rbuf1 = rbuf1; 2560 smat_i->rbuf2 = rbuf2; 2561 smat_i->rbuf3 = rbuf3; 2562 smat_i->sbuf2 = sbuf2; 2563 smat_i->req_source2 = req_source2; 2564 2565 smat_i->sbuf1 = sbuf1; 2566 smat_i->ptr = ptr; 2567 smat_i->tmp = tmp; 2568 smat_i->ctr = ctr; 2569 2570 smat_i->pa = pa; 2571 smat_i->req_size = req_size; 2572 smat_i->req_source1 = req_source1; 2573 2574 smat_i->allcolumns = PETSC_FALSE; 2575 smat_i->singleis = PETSC_FALSE; 2576 smat_i->row2proc = NULL; 2577 smat_i->rmap = NULL; 2578 smat_i->cmap = NULL; 2579 } 2580 2581 if (ismax) PetscCall(PetscFree(lens[0])); 2582 PetscCall(PetscFree(lens)); 2583 if (sbuf_aj) { 2584 PetscCall(PetscFree(sbuf_aj[0])); 2585 PetscCall(PetscFree(sbuf_aj)); 2586 } 2587 2588 } /* endof scall == MAT_INITIAL_MATRIX */ 2589 2590 /* Post recv matrix values */ 2591 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2592 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2593 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2594 for (i = 0; i < nrqs; ++i) { 2595 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2596 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2597 } 2598 2599 /* Allocate sending buffers for a->a, and send them off */ 2600 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2601 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2602 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 2603 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2604 2605 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2606 { 2607 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite; 2608 PetscInt cstart = C->cmap->rstart, rstart = 
C->rmap->rstart, *bmap = c->garray; 2609 PetscInt cend = C->cmap->rend; 2610 PetscInt *b_j = b->j; 2611 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2612 2613 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2614 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2615 for (i = 0; i < nrqr; i++) { 2616 rbuf1_i = rbuf1[i]; 2617 sbuf_aa_i = sbuf_aa[i]; 2618 ct1 = 2 * rbuf1_i[0] + 1; 2619 ct2 = 0; 2620 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2621 kmax = rbuf1_i[2 * j]; 2622 for (k = 0; k < kmax; k++, ct1++) { 2623 row = rbuf1_i[ct1] - rstart; 2624 nzA = a_i[row + 1] - a_i[row]; 2625 nzB = b_i[row + 1] - b_i[row]; 2626 ncols = nzA + nzB; 2627 cworkB = b_j + b_i[row]; 2628 vworkA = a_a + a_i[row]; 2629 vworkB = b_a + b_i[row]; 2630 2631 /* load the column values for this row into vals*/ 2632 vals = sbuf_aa_i + ct2; 2633 2634 lwrite = 0; 2635 for (l = 0; l < nzB; l++) { 2636 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2637 } 2638 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2639 for (l = 0; l < nzB; l++) { 2640 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2641 } 2642 2643 ct2 += ncols; 2644 } 2645 } 2646 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2647 } 2648 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2649 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2650 } 2651 2652 /* Assemble the matrices */ 2653 /* First assemble the local rows */ 2654 for (i = 0; i < ismax; i++) { 2655 row2proc_i = row2proc[i]; 2656 subc = (Mat_SeqAIJ *)submats[i]->data; 2657 imat_ilen = subc->ilen; 2658 imat_j = subc->j; 2659 imat_i = subc->i; 2660 imat_a = subc->a; 2661 2662 if (!allcolumns[i]) cmap_i = cmap[i]; 2663 rmap_i = rmap[i]; 2664 irow_i = irow[i]; 2665 jmax = nrow[i]; 2666 for (j = 0; j < jmax; j++) { 2667 row = irow_i[j]; 2668 proc = row2proc_i[j]; 2669 if (proc == rank) { 2670 old_row = row; 2671 
#if defined(PETSC_USE_CTABLE) 2672 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2673 row--; 2674 #else 2675 row = rmap_i[row]; 2676 #endif 2677 ilen_row = imat_ilen[row]; 2678 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2679 mat_i = imat_i[row]; 2680 mat_a = imat_a + mat_i; 2681 mat_j = imat_j + mat_i; 2682 if (!allcolumns[i]) { 2683 for (k = 0; k < ncols; k++) { 2684 #if defined(PETSC_USE_CTABLE) 2685 PetscCall(PetscHMapIGetWithDefault(cmap_i, cols[k] + 1, 0, &tcol)); 2686 #else 2687 tcol = cmap_i[cols[k]]; 2688 #endif 2689 if (tcol) { 2690 *mat_j++ = tcol - 1; 2691 *mat_a++ = vals[k]; 2692 ilen_row++; 2693 } 2694 } 2695 } else { /* allcolumns */ 2696 for (k = 0; k < ncols; k++) { 2697 *mat_j++ = cols[k]; /* global col index! */ 2698 *mat_a++ = vals[k]; 2699 ilen_row++; 2700 } 2701 } 2702 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2703 2704 imat_ilen[row] = ilen_row; 2705 } 2706 } 2707 } 2708 2709 /* Now assemble the off proc rows */ 2710 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2711 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2712 sbuf1_i = sbuf1[pa[tmp2]]; 2713 jmax = sbuf1_i[0]; 2714 ct1 = 2 * jmax + 1; 2715 ct2 = 0; 2716 rbuf2_i = rbuf2[tmp2]; 2717 rbuf3_i = rbuf3[tmp2]; 2718 rbuf4_i = rbuf4[tmp2]; 2719 for (j = 1; j <= jmax; j++) { 2720 is_no = sbuf1_i[2 * j - 1]; 2721 rmap_i = rmap[is_no]; 2722 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2723 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2724 imat_ilen = subc->ilen; 2725 imat_j = subc->j; 2726 imat_i = subc->i; 2727 imat_a = subc->a; 2728 max1 = sbuf1_i[2 * j]; 2729 for (k = 0; k < max1; k++, ct1++) { 2730 row = sbuf1_i[ct1]; 2731 #if defined(PETSC_USE_CTABLE) 2732 PetscCall(PetscHMapIGetWithDefault(rmap_i, row + 1, 0, &row)); 2733 row--; 2734 #else 2735 row = rmap_i[row]; 2736 #endif 2737 ilen = imat_ilen[row]; 2738 mat_i = imat_i[row]; 2739 mat_a = imat_a + mat_i; 2740 mat_j = imat_j + mat_i; 2741 max2 = rbuf2_i[ct1]; 2742 if 
(!allcolumns[is_no]) { 2743 for (l = 0; l < max2; l++, ct2++) { 2744 #if defined(PETSC_USE_CTABLE) 2745 PetscCall(PetscHMapIGetWithDefault(cmap_i, rbuf3_i[ct2] + 1, 0, &tcol)); 2746 #else 2747 tcol = cmap_i[rbuf3_i[ct2]]; 2748 #endif 2749 if (tcol) { 2750 *mat_j++ = tcol - 1; 2751 *mat_a++ = rbuf4_i[ct2]; 2752 ilen++; 2753 } 2754 } 2755 } else { /* allcolumns */ 2756 for (l = 0; l < max2; l++, ct2++) { 2757 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2758 *mat_a++ = rbuf4_i[ct2]; 2759 ilen++; 2760 } 2761 } 2762 imat_ilen[row] = ilen; 2763 } 2764 } 2765 } 2766 2767 if (!iscsorted) { /* sort column indices of the rows */ 2768 for (i = 0; i < ismax; i++) { 2769 subc = (Mat_SeqAIJ *)submats[i]->data; 2770 imat_j = subc->j; 2771 imat_i = subc->i; 2772 imat_a = subc->a; 2773 imat_ilen = subc->ilen; 2774 2775 if (allcolumns[i]) continue; 2776 jmax = nrow[i]; 2777 for (j = 0; j < jmax; j++) { 2778 mat_i = imat_i[j]; 2779 mat_a = imat_a + mat_i; 2780 mat_j = imat_j + mat_i; 2781 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2782 } 2783 } 2784 } 2785 2786 PetscCall(PetscFree(r_waits4)); 2787 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2788 PetscCall(PetscFree(s_waits4)); 2789 2790 /* Restore the indices */ 2791 for (i = 0; i < ismax; i++) { 2792 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2793 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2794 } 2795 2796 for (i = 0; i < ismax; i++) { 2797 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2798 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2799 } 2800 2801 /* Destroy allocated memory */ 2802 if (sbuf_aa) { 2803 PetscCall(PetscFree(sbuf_aa[0])); 2804 PetscCall(PetscFree(sbuf_aa)); 2805 } 2806 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2807 2808 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2809 PetscCall(PetscFree(rbuf4)); 2810 2811 
PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2812 PetscFunctionReturn(0); 2813 } 2814 2815 /* 2816 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2817 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2818 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2819 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2820 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2821 state, and needs to be "assembled" later by compressing B's column space. 2822 2823 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2824 Following this call, C->A & C->B have been created, even if empty. 2825 */ 2826 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2827 { 2828 /* If making this function public, change the error returned in this function away from _PLIB. */ 2829 Mat_MPIAIJ *aij; 2830 Mat_SeqAIJ *Baij; 2831 PetscBool seqaij, Bdisassembled; 2832 PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count; 2833 PetscScalar v; 2834 const PetscInt *rowindices, *colindices; 2835 2836 PetscFunctionBegin; 2837 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
*/ 2838 if (A) { 2839 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2840 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2841 if (rowemb) { 2842 PetscCall(ISGetLocalSize(rowemb, &m)); 2843 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2844 } else { 2845 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2846 } 2847 if (dcolemb) { 2848 PetscCall(ISGetLocalSize(dcolemb, &n)); 2849 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2850 } else { 2851 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2852 } 2853 } 2854 if (B) { 2855 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2856 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2857 if (rowemb) { 2858 PetscCall(ISGetLocalSize(rowemb, &m)); 2859 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2860 } else { 2861 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2862 } 2863 if (ocolemb) { 2864 PetscCall(ISGetLocalSize(ocolemb, &n)); 2865 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2866 } else { 2867 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with 
the MPIAIJ matrix"); 2868 } 2869 } 2870 2871 aij = (Mat_MPIAIJ *)C->data; 2872 if (!aij->A) { 2873 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2874 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2875 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2876 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2877 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2878 } 2879 if (A) { 2880 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2881 } else { 2882 PetscCall(MatSetUp(aij->A)); 2883 } 2884 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2885 /* 2886 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2887 need to "disassemble" B -- convert it to using C's global indices. 2888 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2889 2890 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2891 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2892 2893 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2894 At least avoid calling MatSetValues() and the implied searches? 2895 */ 2896 2897 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2898 #if defined(PETSC_USE_CTABLE) 2899 PetscCall(PetscHMapIDestroy(&aij->colmap)); 2900 #else 2901 PetscCall(PetscFree(aij->colmap)); 2902 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. 
*/ 2903 #endif 2904 ngcol = 0; 2905 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2906 if (aij->garray) { PetscCall(PetscFree(aij->garray)); } 2907 PetscCall(VecDestroy(&aij->lvec)); 2908 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2909 } 2910 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2911 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2912 } 2913 Bdisassembled = PETSC_FALSE; 2914 if (!aij->B) { 2915 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2916 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2917 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2918 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2919 Bdisassembled = PETSC_TRUE; 2920 } 2921 if (B) { 2922 Baij = (Mat_SeqAIJ *)B->data; 2923 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2924 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2925 for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2926 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2927 PetscCall(PetscFree(nz)); 2928 } 2929 2930 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2931 shift = rend - rstart; 2932 count = 0; 2933 rowindices = NULL; 2934 colindices = NULL; 2935 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2936 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2937 for (i = 0; i < B->rmap->n; i++) { 2938 PetscInt row; 2939 row = i; 2940 if (rowindices) row = rowindices[i]; 2941 for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2942 col = Baij->j[count]; 2943 if (colindices) col = colindices[col]; 2944 if (Bdisassembled && col >= rstart) col += shift; 2945 v = Baij->a[count]; 2946 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2947 ++count; 2948 } 2949 } 2950 /* No assembly for aij->B is necessary. */ 2951 /* FIXME: set aij->B's nonzerostate correctly. 
*/ 2952 } else { 2953 PetscCall(MatSetUp(aij->B)); 2954 } 2955 C->preallocated = PETSC_TRUE; 2956 C->was_assembled = PETSC_FALSE; 2957 C->assembled = PETSC_FALSE; 2958 /* 2959 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2960 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2961 */ 2962 PetscFunctionReturn(0); 2963 } 2964 2965 /* 2966 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2967 */ 2968 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 2969 { 2970 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 2971 2972 PetscFunctionBegin; 2973 PetscValidPointer(A, 2); 2974 PetscValidPointer(B, 3); 2975 /* FIXME: make sure C is assembled */ 2976 *A = aij->A; 2977 *B = aij->B; 2978 /* Note that we don't incref *A and *B, so be careful! */ 2979 PetscFunctionReturn(0); 2980 } 2981 2982 /* 2983 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 2984 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
/*
  Generic driver: extracts submatrices of C described by (isrow[i], iscol[i]) pairs whose ISs may live
  on subcommunicators of C's comm. The actual extraction/assembly work is delegated to the supplied
  callbacks so that this routine can be shared across matrix types:
    getsubmats_seq - extracts sequential submatrices on each rank
    getlocalmats   - retrieves the local (diag/off-diag) blocks of an MPI matrix
                     (NOTE(review): appears unused in this code path -- presumably reserved for the
                      MAT_REUSE_MATRIX path; confirm before removing)
    setseqmat      - embeds one sequential matrix into a target (size == 1 case)
    setseqmats     - embeds diag+off-diag sequential pieces into a parallel target (size > 1 case)
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    i, ii, cismax, ispar;
  Mat        *A, *B;
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel ISs (comm size > 1); each row IS must share its comm with the matching col IS. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        j, issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    /* After sorting, a duplicate must sit next to its twin, so adjacent comparison suffices. */
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction; see the comment above. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        /* BB (== B[ii]) is not destroyed here; the whole B array is released below via MatDestroySubMatrices(). */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* Ownership of A[i] transfers to the caller's submat array. */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(0);
}

/* MPIAIJ front end: binds the MPIAIJ-specific callbacks into the generic MPIXAIJ driver above. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}