/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

/* Forward declarations for the non-scalable (dense work-array) implementation */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat, PetscInt, IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat, PetscInt, char **, PetscInt *, PetscInt **, PetscTable *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat, PetscInt, PetscInt **, PetscInt **, PetscInt *);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat, PetscInt, PetscInt *, PetscInt **, PetscScalar **);

/* Forward declarations for the scalable (PetscSF-based) implementation */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat, PetscInt, IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat, PetscInt, IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat, PetscInt, PetscMPIInt, PetscMPIInt *, PetscInt *, PetscInt *, PetscInt **, PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat, PetscInt, IS *, PetscInt, PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Grow each index set in is[] by `ov` levels of
   overlap by applying the single-level expansion `ov` times.

   Input:
     C    - the MPIAIJ matrix
     imax - number of index sets
     is   - the index sets (replaced in place by the enlarged sets)
     ov   - number of overlap levels to add; must be >= 0
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C, imax, is));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but each single-level expansion uses the memory-scalable PetscSF code path.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C, PetscInt imax, IS is[], PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
  for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C, imax, is));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Add one level of overlap to the
   index sets is[0..nidx-1], exchanging only the rows each rank actually needs.

   Steps visible below:
     1. collect all remotely-owned indices from every IS;
     2. if no rank has remote indices (the overlap-1 fast path), only local work is done;
     3. otherwise build a two-sided rendezvous (PetscCommBuildTwoSided) and a
        PetscSF to ship <is id, row> pairs to the owning ranks;
     4. owners expand the rows (Send_Scalable), results are shipped back while
        local expansion overlaps with communication, and Receive_Scalable merges them.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  MPI_Comm        comm;
  PetscInt       *length, length_i, tlength, *remoterows, nrrows, reducednrrows, *rrow_ranks, *rrow_isids, i, j;
  PetscInt       *tosizes, *tosizes_temp, *toffsets, *fromsizes, *todata, *fromdata;
  PetscInt        nrecvrows, *sbsizes = NULL, *sbdata = NULL;
  const PetscInt *indices_i, **indices;
  PetscLayout     rmap;
  PetscMPIInt     rank, size, *toranks, *fromranks, nto, nfrom, owner;
  PetscSF         sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat, &rmap, NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx, (PetscInt ***)&indices, nidx, &length));
  for (i = 0, tlength = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i], &indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength, &remoterows, tlength, &rrow_ranks, tlength, &rrow_isids));
  PetscCall(PetscCalloc3(size, &toranks, 2 * size, &tosizes, size, &tosizes_temp));
  nrrows = 0;
  for (i = 0; i < nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j = 0; j < length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap, indices_i[j], &owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&indices, length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows, &reducednrrows, 1, MPIU_INT, MPI_MAX, comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
    PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communcation */
  for (i = 0; i < size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto * 2] = tosizes_temp[i] * 2; /* size */
      tosizes_temp[i]  = nto;                 /* a map from processor to index */
      toranks[nto++]   = i;                   /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto + 1, &toffsets));
  toffsets[0] = 0;
  for (i = 0; i < nto; i++) {
    toffsets[i + 1]    = toffsets[i] + tosizes[2 * i]; /* offsets */
    tosizes[2 * i + 1] = toffsets[i];                  /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nto, toranks, tosizes, &nfrom, &fromranks, &fromsizes));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) nrecvrows += fromsizes[2 * i];
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nfrom; i++) {
    for (j = 0; j < fromsizes[2 * i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2 * nrrows, &todata, nrecvrows, &fromdata));
  for (i = 0; i < nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks, tosizes, tosizes_temp));
  PetscCall(PetscFree3(remoterows, rrow_ranks, rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, todata, fromdata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat, nidx, nfrom, fromranks, fromsizes, fromdata, &sbsizes, &sbdata));
  PetscCall(PetscFree2(todata, fromdata));
  PetscCall(PetscFree(fromsizes));
  PetscCall(PetscCommBuildTwoSided(comm, 2, MPIU_INT, nfrom, fromranks, sbsizes, &nto, &toranks, &tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) nrecvrows += tosizes[2 * i];
  PetscCall(PetscCalloc1(nrecvrows, &todata));
  PetscCall(PetscMalloc1(nrecvrows, &remote));
  nrecvrows = 0;
  for (i = 0; i < nto; i++) {
    for (j = 0; j < tosizes[2 * i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2 * i + 1] + j;
    }
  }
  PetscCall(PetscSFCreate(comm, &sf));
  PetscCall(PetscSFSetGraph(sf, nrecvrows, nrecvrows, NULL, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf, PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation */
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat, nidx, is));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, sbdata, todata, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata, sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat, nidx, is, nrecvrows, todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge the received overlap data
   (triples <is id, count, rows...> packed in recvdata) into the index sets,
   sort, remove duplicates, and recreate each IS on its original communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat, PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt        *isz, isz_i, i, j, is_id, data_size;
  PetscInt         col, lsize, max_lsize, *indices_temp, *indices_i;
  const PetscInt  *indices_i_temp;
  MPI_Comm        *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx, &isz));
  for (i = 0; i < nidx; i++) {
    PetscCall(ISGetLocalSize(is[i], &lsize));
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of max_lsize + nrecvs entries: enough for its current
     indices plus, in the worst case, every received entry */
  PetscCall(PetscMalloc2((max_lsize + nrecvs) * nidx, &indices_temp, nidx, &iscomms));
  for (i = 0; i < nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
    PetscCall(ISGetIndices(is[i], &indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp + i * (max_lsize + nrecvs), indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i], &indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i = 0; i < nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp + (max_lsize + nrecvs) * is_id;
    isz_i     = isz[is_id];
    for (j = 0; j < data_size; j++) {
      col                = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i = 0; i < nidx; i++) {
    indices_i = indices_temp + (max_lsize + nrecvs) * i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i, indices_i));
    PetscCall(ISCreateGeneral(iscomms[i], isz_i, indices_i, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp, iscomms));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For rows requested by other ranks
   (fromrows, packed as <is id, row> pairs per sender), expand each row using the
   local diagonal (Amat) and off-diagonal (Bmat) blocks and pack the reply as
   triples <is id, count, columns...>. Returns the per-sender reply sizes in
   *sbrowsizes and the packed reply data in *sbrows; the caller frees both.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat, PetscInt nidx, PetscMPIInt nfrom, PetscMPIInt *fromranks, PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout     rmap, cmap;
  PetscInt        i, j, k, l, *rows_i, *rows_data_ptr, **rows_data, max_fszs, rows_pos, *rows_pos_i;
  PetscInt        is_id, tnz, an, bn, rstart, cstart, row, start, end, col, totalrows, *sbdata;
  PetscInt       *indv_counts, indvc_ij, *sbsizes, *indices_tmp, *offsets;
  const PetscInt *gcols, *ai, *aj, *bi, *bj;
  Mat             amat, bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i = 0; i < nfrom; i++) max_fszs = fromsizes[2 * i] > max_fszs ? fromsizes[2 * i] : max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs * nidx, &rows_data_ptr, nidx, &rows_data, nidx, &rows_pos_i, nfrom * nidx, &indv_counts, tnz, &indices_tmp));
  for (i = 0; i < nidx; i++) rows_data[i] = rows_data_ptr + max_fszs * i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: compute per-(sender, is) deduplicated counts so the reply
     buffer can be allocated in one shot */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    /* group data together */
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j = 0; j < nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      indv_counts[i * nidx + j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows + nidx * nfrom * 2, &sbdata, 2 * nfrom, &sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i = 0; i < nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i, nidx));
    for (j = 0; j < fromsizes[2 * i]; j += 2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j = 0; j < nidx; j++) {
      if (!indv_counts[i * nidx + j]) continue;
      indvc_ij            = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i * nidx + j];
      sbsizes[2 * i] += 2;
      rows_i = rows_data[j];
      for (l = 0; l < rows_pos_i[j]; l++) {
        row   = rows_i[l] - rstart;
        start = ai[row];
        end   = ai[row + 1];
        for (k = start; k < end; k++) { /* Amat */
          col                     = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row + 1];
        for (k = start; k < end; k++) { /* Bmat */
          col                     = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij, indices_tmp));
      sbsizes[2 * i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata + totalrows, indices_tmp, indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom + 1, &offsets));
  offsets[0] = 0;
  for (i = 0; i < nfrom; i++) {
    offsets[i + 1]     = offsets[i] + sbsizes[2 * i];
    sbsizes[2 * i + 1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr, rows_data, rows_pos_i, indv_counts, indices_tmp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each IS by the columns of
   all locally-owned rows it contains (remote rows are skipped; they are handled
   by the send/receive pair). Each IS is rebuilt sorted and duplicate-free on a
   duplicate of its original communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat, PetscInt nidx, IS is[])
{
  const PetscInt *gcols, *ai, *aj, *bi, *bj, *indices;
  PetscInt        tnz, an, bn, i, j, row, start, end, rstart, cstart, col, k, *indices_temp;
  PetscInt        lsize, lsize_tmp;
  PetscMPIInt     rank, owner;
  Mat             amat, bmat;
  PetscBool       done;
  PetscLayout     cmap, rmap;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat, &amat, &bmat, &gcols));
  PetscCall(MatGetRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz = ai[an] + bi[bn];
  PetscCall(MatGetLayouts(mat, &rmap, &cmap));
  PetscCall(PetscLayoutGetRange(rmap, &rstart, NULL));
  PetscCall(PetscLayoutGetRange(cmap, &cstart, NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz, &indices_temp));
  for (i = 0; i < nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i], &lsize));
    PetscCall(ISGetIndices(is[i], &indices));
    lsize_tmp = 0;
    for (j = 0; j < lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap, row, &owner));
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        col                       = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        col                       = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i], &indices));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomm, NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp, indices_temp));
    PetscCall(ISCreateGeneral(iscomm, lsize_tmp, indices_temp, PETSC_COPY_VALUES, &is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat, 0, PETSC_FALSE, PETSC_FALSE, &an, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(bmat, 0, PETSC_FALSE, PETSC_FALSE, &bn, &bi, &bj, &done));
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants
 processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2    (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C, PetscInt imax, IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ *)C->data;
  PetscMPIInt     *w1, *w2, nrqr, *w3, *w4, *onodes1, *olengths1, *onodes2, *olengths2;
  const PetscInt **idx, *idx_i;
  PetscInt        *n, **data, len;
#if defined(PETSC_USE_CTABLE)
  PetscTable *table_data, table_data_i;
  PetscInt   *tdata, tcount, tcount_max;
#else
  PetscInt *data_i, *d_p;
#endif
  PetscMPIInt  size, rank, tag1, tag2, proc = 0;
  PetscInt     M, i, j, k, **rbuf, row, nrqs, msz, **outdat, **ptr;
  PetscInt    *ctr, *pa, *tmp, *isz, *isz1, **xdata, **rbuf2;
  PetscBT     *table;
  MPI_Comm     comm;
  MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2;
  MPI_Status  *recv_status;
  MPI_Comm    *iscomms;
  char        *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));

  PetscCall(PetscMalloc2(imax, (PetscInt ***)&idx, imax, &n));

  for (i = 0; i < imax; i++) {
    PetscCall(ISGetIndices(is[i], &idx[i]));
    PetscCall(ISGetLocalSize(is[i], &n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4));
  for (i = 0; i < imax; i++) {
    PetscCall(PetscArrayzero(w4, size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j = 0; j < len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
      w4[proc]++;
    }
    for (j = 0; j < size; j++) {
      if (w4[j]) {
        w1[j] += w4[j];
        w3[j]++;
      }
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to intself */
  w3[rank] = 0;
  for (i = 0; i < size; i++) {
    if (w1[i]) {
      w2[i] = 1;
      nrqs++;
    } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs, &pa));
  for (i = 0, j = 0; i < size; i++) {
    if (w1[i]) {
      pa[j] = i;
      j++;
    }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i < nrqs; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2 * w3[j];
    msz += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr));
  PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf, &r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size, &outdat, size, &ptr, msz, &tmp, size, &ctr));
  PetscCall(PetscArrayzero(outdat, size));
  PetscCall(PetscArrayzero(ptr, size));

  {
    /* carve the single `tmp' buffer into one outgoing slice per destination */
    PetscInt *iptr = tmp, ict = 0;
    for (i = 0; i < nrqs; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i = 0; i < nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j] + 1, 2 * w3[j]));
    ptr[j] = outdat[j] + 2 * w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax, &table_data));
    for (i = 0; i < imax; i++) PetscCall(PetscTableCreate(n[i], M, &table_data[i]));
    PetscCall(PetscCalloc4(imax, &table, imax, &data, imax, &isz, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M, imax, &Mimax));
    PetscCall(PetscIntMultError((M / PETSC_BITS_PER_BYTE + 1), imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax, &table, imax, &data, imax, &isz, Mimax, &d_p, M_BPB_imax, &t_p));
    for (i = 0; i < imax; i++) {
      table[i] = t_p + (M / PETSC_BITS_PER_BYTE + 1) * i;
      data[i]  = d_p + M * i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i, isz_i, *outdat_j, ctr_j;
    PetscBT  table_i;

    for (i = 0; i < imax; i++) {
      PetscCall(PetscArrayzero(ctr, size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j = 0; j < n_i; j++) { /* parse the indices of each IS */
        row = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap, row, &proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j = 0; j < size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j            = outdat[j];
          k                   = ++outdat_j[0];
          outdat_j[2 * k]     = ctr_j;
          outdat_j[2 * k - 1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs, &s_waits1));
  for (i = 0; i < nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j], w1[j], MPIU_INT, j, tag1, comm, s_waits1 + i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax, &iscomms));
  for (i = 0; i < imax; ++i) {
    PetscCall(ISRestoreIndices(is[i], idx + i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]), &iscomms[i], NULL));
  }
  PetscCall(PetscFree2(*(PetscInt ***)&idx, n));

  for (i = 0; i < imax; ++i) PetscCall(ISDestroy(&is[i]));

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, NULL, table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C, imax, table, isz, data, NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr, &recv_status));
  PetscCallMPI(MPI_Waitall(nrqr, r_waits1, recv_status));
  PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat, ptr, tmp, ctr));
  PetscCall(PetscFree4(w1, w2, w3, w4));

  PetscCall(PetscMalloc1(nrqr, &xdata));
  PetscCall(PetscMalloc1(nrqr, &isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C, nrqr, rbuf, xdata, isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size, &rw1));

    for (i = 0; i < nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm, nrqr, nrqs, rw1, &onodes2, &olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm, tag2, nrqs, onodes2, olengths2, &rbuf2, &r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr, &s_waits2));
  for (i = 0; i < nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i], isz1[i], MPIU_INT, j, tag2, comm, s_waits2 + i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no, ct1, max, *rbuf2_i, isz_i, jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i = 0; i < nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs, r_waits2, &idex, MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2 * rbuf2_i[0] + 1;
      jmax    = rbuf2[idex][0];
      for (j = 1; j <= jmax; j++) {
        max     = rbuf2_i[2 * j];
        is_no   = rbuf2_i[2 * j - 1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k = 0; k < max; k++, ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i, row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i, row + 1, isz_i + 1, INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i = 0; i < imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max + 1, &tdata));
#endif

  for (i = 0; i < imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &k, &j));
      tdata[--j] = --k; /* table stores 1-based <row+1, pos+1>; shift both back */
    }
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], tdata, PETSC_COPY_VALUES, is + i));
#else
    PetscCall(ISCreateGeneral(iscomms[i], isz[i], data[i], PETSC_COPY_VALUES, is + i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i = 0; i < imax; i++) PetscCall(PetscTableDestroy((PetscTable *)&table_data[i]));
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table, data, isz, t_p));
#else
  PetscCall(PetscFree5(table, data, isz, d_p, t_p));
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

  Inputs:
   C      - MAT_MPIAIJ;
   imax   - total no of index sets processed at a time;
   table  - an array of char - size = m bits.

  Output:
   isz    - array containing the count of the solution elements corresponding
            to each index set;
   data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C, PetscInt imax, PetscBT *table, PetscInt *isz, PetscInt **data, PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data;
  PetscInt    start, end, val, max, rstart, cstart, *ai, *aj;
  PetscInt   *bi, *bj, *garray, i, j, k, row, isz_i;
  PetscBT     table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount, *tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i = 0; i < imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i, &tcount));
    PetscCheck(tcount == isz[i], PETSC_COMM_SELF, PETSC_ERR_PLIB, " tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT, tcount, i, isz[i]);

    PetscCall(PetscMalloc1(tcount, &tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i, &tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i, &tpos, &row, &j));
      tdata[--j] = --row; /* table stores 1-based <row+1, pos+1>; shift both back */
      PetscCheck(j <= tcount - 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, " j %" PetscInt_FMT " >= tcount %" PetscInt_FMT, j, tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for every row already in the i-th set, append any column (diagonal block
       Amat + off-diagonal block Bmat) not yet marked in the bit table */
    for (j = 0; j < max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row + 1];
      for (k = start; k < end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row + 1];
      for (k = start; k < end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i, val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i, val + 1, isz_i + 1, INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf - an array of pointers to the received requests

   Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.
909 910 */ 911 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C, PetscInt nrqr, PetscInt **rbuf, PetscInt **xdata, PetscInt *isz1) 912 { 913 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 914 Mat A = c->A, B = c->B; 915 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data; 916 PetscInt rstart, cstart, *ai, *aj, *bi, *bj, *garray, i, j, k; 917 PetscInt row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end; 918 PetscInt val, max1, max2, m, no_malloc = 0, *tmp, new_estimate, ctr; 919 PetscInt *rbuf_i, kmax, rbuf_0; 920 PetscBT xtable; 921 922 PetscFunctionBegin; 923 m = C->rmap->N; 924 rstart = C->rmap->rstart; 925 cstart = C->cmap->rstart; 926 ai = a->i; 927 aj = a->j; 928 bi = b->i; 929 bj = b->j; 930 garray = c->garray; 931 932 for (i = 0, ct = 0, total_sz = 0; i < nrqr; ++i) { 933 rbuf_i = rbuf[i]; 934 rbuf_0 = rbuf_i[0]; 935 ct += rbuf_0; 936 for (j = 1; j <= rbuf_0; j++) total_sz += rbuf_i[2 * j]; 937 } 938 939 if (C->rmap->n) max1 = ct * (a->nz + b->nz) / C->rmap->n; 940 else max1 = 1; 941 mem_estimate = 3 * ((total_sz > max1 ? 
total_sz : max1) + 1); 942 if (nrqr) { 943 PetscCall(PetscMalloc1(mem_estimate, &xdata[0])); 944 ++no_malloc; 945 } 946 PetscCall(PetscBTCreate(m, &xtable)); 947 PetscCall(PetscArrayzero(isz1, nrqr)); 948 949 ct3 = 0; 950 for (i = 0; i < nrqr; i++) { /* for easch mesg from proc i */ 951 rbuf_i = rbuf[i]; 952 rbuf_0 = rbuf_i[0]; 953 ct1 = 2 * rbuf_0 + 1; 954 ct2 = ct1; 955 ct3 += ct1; 956 for (j = 1; j <= rbuf_0; j++) { /* for each IS from proc i*/ 957 PetscCall(PetscBTMemzero(m, xtable)); 958 oct2 = ct2; 959 kmax = rbuf_i[2 * j]; 960 for (k = 0; k < kmax; k++, ct1++) { 961 row = rbuf_i[ct1]; 962 if (!PetscBTLookupSet(xtable, row)) { 963 if (!(ct3 < mem_estimate)) { 964 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 965 PetscCall(PetscMalloc1(new_estimate, &tmp)); 966 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 967 PetscCall(PetscFree(xdata[0])); 968 xdata[0] = tmp; 969 mem_estimate = new_estimate; 970 ++no_malloc; 971 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 972 } 973 xdata[i][ct2++] = row; 974 ct3++; 975 } 976 } 977 for (k = oct2, max2 = ct2; k < max2; k++) { 978 row = xdata[i][k] - rstart; 979 start = ai[row]; 980 end = ai[row + 1]; 981 for (l = start; l < end; l++) { 982 val = aj[l] + cstart; 983 if (!PetscBTLookupSet(xtable, val)) { 984 if (!(ct3 < mem_estimate)) { 985 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 986 PetscCall(PetscMalloc1(new_estimate, &tmp)); 987 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 988 PetscCall(PetscFree(xdata[0])); 989 xdata[0] = tmp; 990 mem_estimate = new_estimate; 991 ++no_malloc; 992 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 993 } 994 xdata[i][ct2++] = val; 995 ct3++; 996 } 997 } 998 start = bi[row]; 999 end = bi[row + 1]; 1000 for (l = start; l < end; l++) { 1001 val = garray[bj[l]]; 1002 if (!PetscBTLookupSet(xtable, val)) { 1003 if (!(ct3 < mem_estimate)) { 1004 new_estimate = (PetscInt)(1.5 * mem_estimate) + 1; 1005 
PetscCall(PetscMalloc1(new_estimate, &tmp)); 1006 PetscCall(PetscArraycpy(tmp, xdata[0], mem_estimate)); 1007 PetscCall(PetscFree(xdata[0])); 1008 xdata[0] = tmp; 1009 mem_estimate = new_estimate; 1010 ++no_malloc; 1011 for (ctr = 1; ctr <= i; ctr++) xdata[ctr] = xdata[ctr - 1] + isz1[ctr - 1]; 1012 } 1013 xdata[i][ct2++] = val; 1014 ct3++; 1015 } 1016 } 1017 } 1018 /* Update the header*/ 1019 xdata[i][2 * j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1020 xdata[i][2 * j - 1] = rbuf_i[2 * j - 1]; 1021 } 1022 xdata[i][0] = rbuf_0; 1023 if (i + 1 < nrqr) xdata[i + 1] = xdata[i] + ct2; 1024 isz1[i] = ct2; /* size of each message */ 1025 } 1026 PetscCall(PetscBTDestroy(&xtable)); 1027 PetscCall(PetscInfo(C, "Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n", mem_estimate, ct3, no_malloc)); 1028 PetscFunctionReturn(0); 1029 } 1030 /* -------------------------------------------------------------------------*/ 1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat *); 1032 /* 1033 Every processor gets the entire matrix 1034 */ 1035 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A, MatCreateSubMatrixOption flag, MatReuse scall, Mat *Bin[]) 1036 { 1037 Mat B; 1038 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 1039 Mat_SeqAIJ *b, *ad = (Mat_SeqAIJ *)a->A->data, *bd = (Mat_SeqAIJ *)a->B->data; 1040 PetscMPIInt size, rank, *recvcounts = NULL, *displs = NULL; 1041 PetscInt sendcount, i, *rstarts = A->rmap->range, n, cnt, j; 1042 PetscInt m, *b_sendj, *garray = a->garray, *lens, *jsendbuf, *a_jsendbuf, *b_jsendbuf; 1043 1044 PetscFunctionBegin; 1045 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 1046 PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A), &rank)); 1047 if (scall == MAT_INITIAL_MATRIX) { 1048 /* ---------------------------------------------------------------- 1049 Tell every processor the number of nonzeros per 
row 1050 */ 1051 PetscCall(PetscMalloc1(A->rmap->N, &lens)); 1052 for (i = A->rmap->rstart; i < A->rmap->rend; i++) lens[i] = ad->i[i - A->rmap->rstart + 1] - ad->i[i - A->rmap->rstart] + bd->i[i - A->rmap->rstart + 1] - bd->i[i - A->rmap->rstart]; 1053 PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); 1054 for (i = 0; i < size; i++) { 1055 recvcounts[i] = A->rmap->range[i + 1] - A->rmap->range[i]; 1056 displs[i] = A->rmap->range[i]; 1057 } 1058 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, lens, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A))); 1059 /* --------------------------------------------------------------- 1060 Create the sequential matrix of the same type as the local block diagonal 1061 */ 1062 PetscCall(MatCreate(PETSC_COMM_SELF, &B)); 1063 PetscCall(MatSetSizes(B, A->rmap->N, A->cmap->N, PETSC_DETERMINE, PETSC_DETERMINE)); 1064 PetscCall(MatSetBlockSizesFromMats(B, A, A)); 1065 PetscCall(MatSetType(B, ((PetscObject)a->A)->type_name)); 1066 PetscCall(MatSeqAIJSetPreallocation(B, 0, lens)); 1067 PetscCall(PetscCalloc1(2, Bin)); 1068 **Bin = B; 1069 b = (Mat_SeqAIJ *)B->data; 1070 1071 /*-------------------------------------------------------------------- 1072 Copy my part of matrix column indices over 1073 */ 1074 sendcount = ad->nz + bd->nz; 1075 jsendbuf = b->j + b->i[rstarts[rank]]; 1076 a_jsendbuf = ad->j; 1077 b_jsendbuf = bd->j; 1078 n = A->rmap->rend - A->rmap->rstart; 1079 cnt = 0; 1080 for (i = 0; i < n; i++) { 1081 /* put in lower diagonal portion */ 1082 m = bd->i[i + 1] - bd->i[i]; 1083 while (m > 0) { 1084 /* is it above diagonal (in bd (compressed) numbering) */ 1085 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1086 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1087 m--; 1088 } 1089 1090 /* put in diagonal portion */ 1091 for (j = ad->i[i]; j < ad->i[i + 1]; j++) jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1092 1093 /* put in upper diagonal portion */ 1094 while (m-- > 0) jsendbuf[cnt++] 
= garray[*b_jsendbuf++]; 1095 } 1096 PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt); 1097 1098 /*-------------------------------------------------------------------- 1099 Gather all column indices to all processors 1100 */ 1101 for (i = 0; i < size; i++) { 1102 recvcounts[i] = 0; 1103 for (j = A->rmap->range[i]; j < A->rmap->range[i + 1]; j++) recvcounts[i] += lens[j]; 1104 } 1105 displs[0] = 0; 1106 for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1]; 1107 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, b->j, recvcounts, displs, MPIU_INT, PetscObjectComm((PetscObject)A))); 1108 /*-------------------------------------------------------------------- 1109 Assemble the matrix into useable form (note numerical values not yet set) 1110 */ 1111 /* set the b->ilen (length of each row) values */ 1112 PetscCall(PetscArraycpy(b->ilen, lens, A->rmap->N)); 1113 /* set the b->i indices */ 1114 b->i[0] = 0; 1115 for (i = 1; i <= A->rmap->N; i++) b->i[i] = b->i[i - 1] + lens[i - 1]; 1116 PetscCall(PetscFree(lens)); 1117 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1118 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1119 1120 } else { 1121 B = **Bin; 1122 b = (Mat_SeqAIJ *)B->data; 1123 } 1124 1125 /*-------------------------------------------------------------------- 1126 Copy my part of matrix numerical values into the values location 1127 */ 1128 if (flag == MAT_GET_VALUES) { 1129 const PetscScalar *ada, *bda, *a_sendbuf, *b_sendbuf; 1130 MatScalar *sendbuf, *recvbuf; 1131 1132 PetscCall(MatSeqAIJGetArrayRead(a->A, &ada)); 1133 PetscCall(MatSeqAIJGetArrayRead(a->B, &bda)); 1134 sendcount = ad->nz + bd->nz; 1135 sendbuf = b->a + b->i[rstarts[rank]]; 1136 a_sendbuf = ada; 1137 b_sendbuf = bda; 1138 b_sendj = bd->j; 1139 n = A->rmap->rend - A->rmap->rstart; 1140 cnt = 0; 1141 for (i = 0; i < n; i++) { 1142 /* put in lower 
diagonal portion */ 1143 m = bd->i[i + 1] - bd->i[i]; 1144 while (m > 0) { 1145 /* is it above diagonal (in bd (compressed) numbering) */ 1146 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1147 sendbuf[cnt++] = *b_sendbuf++; 1148 m--; 1149 b_sendj++; 1150 } 1151 1152 /* put in diagonal portion */ 1153 for (j = ad->i[i]; j < ad->i[i + 1]; j++) sendbuf[cnt++] = *a_sendbuf++; 1154 1155 /* put in upper diagonal portion */ 1156 while (m-- > 0) { 1157 sendbuf[cnt++] = *b_sendbuf++; 1158 b_sendj++; 1159 } 1160 } 1161 PetscCheck(cnt == sendcount, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT, sendcount, cnt); 1162 1163 /* ----------------------------------------------------------------- 1164 Gather all numerical values to all processors 1165 */ 1166 if (!recvcounts) PetscCall(PetscMalloc2(size, &recvcounts, size, &displs)); 1167 for (i = 0; i < size; i++) recvcounts[i] = b->i[rstarts[i + 1]] - b->i[rstarts[i]]; 1168 displs[0] = 0; 1169 for (i = 1; i < size; i++) displs[i] = displs[i - 1] + recvcounts[i - 1]; 1170 recvbuf = b->a; 1171 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, recvbuf, recvcounts, displs, MPIU_SCALAR, PetscObjectComm((PetscObject)A))); 1172 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &ada)); 1173 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bda)); 1174 } /* endof (flag == MAT_GET_VALUES) */ 1175 PetscCall(PetscFree2(recvcounts, displs)); 1176 1177 PetscCall(MatPropagateSymmetryOptions(A, B)); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, PetscBool allcolumns, Mat *submats) 1182 { 1183 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 1184 Mat submat, A = c->A, B = c->B; 1185 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *subc; 1186 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, nzA, nzB; 1187 PetscInt cstart = 
C->cmap->rstart, cend = C->cmap->rend, rstart = C->rmap->rstart, *bmap = c->garray; 1188 const PetscInt *icol, *irow; 1189 PetscInt nrow, ncol, start; 1190 PetscMPIInt rank, size, tag1, tag2, tag3, tag4, *w1, *w2, nrqr; 1191 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, ct3, **rbuf1, row, proc; 1192 PetscInt nrqs = 0, msz, **ptr, *req_size, *ctr, *pa, *tmp, tcol, *iptr; 1193 PetscInt **rbuf3, *req_source1, *req_source2, **sbuf_aj, **rbuf2, max1, nnz; 1194 PetscInt *lens, rmax, ncols, *cols, Crow; 1195 #if defined(PETSC_USE_CTABLE) 1196 PetscTable cmap, rmap; 1197 PetscInt *cmap_loc, *rmap_loc; 1198 #else 1199 PetscInt *cmap, *rmap; 1200 #endif 1201 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *sbuf1_i, *rbuf2_i, *rbuf3_i; 1202 PetscInt *cworkB, lwrite, *subcols, *row2proc; 1203 PetscScalar *vworkA, *vworkB, *a_a, *b_a, *subvals = NULL; 1204 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 1205 MPI_Request *r_waits4, *s_waits3 = NULL, *s_waits4; 1206 MPI_Status *r_status1, *r_status2, *s_status1, *s_status3 = NULL, *s_status2; 1207 MPI_Status *r_status3 = NULL, *r_status4, *s_status4; 1208 MPI_Comm comm; 1209 PetscScalar **rbuf4, **sbuf_aa, *vals, *sbuf_aa_i, *rbuf4_i; 1210 PetscMPIInt *onodes1, *olengths1, idex, end; 1211 Mat_SubSppt *smatis1; 1212 PetscBool isrowsorted, iscolsorted; 1213 1214 PetscFunctionBegin; 1215 PetscValidLogicalCollectiveInt(C, ismax, 2); 1216 PetscValidLogicalCollectiveEnum(C, scall, 5); 1217 PetscCheck(ismax == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "This routine only works when all processes have ismax=1"); 1218 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 1219 PetscCall(MatSeqAIJGetArrayRead(B, (const PetscScalar **)&b_a)); 1220 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 1221 size = c->size; 1222 rank = c->rank; 1223 1224 PetscCall(ISSorted(iscol[0], &iscolsorted)); 1225 PetscCall(ISSorted(isrow[0], &isrowsorted)); 1226 PetscCall(ISGetIndices(isrow[0], &irow)); 1227 
PetscCall(ISGetLocalSize(isrow[0], &nrow)); 1228 if (allcolumns) { 1229 icol = NULL; 1230 ncol = C->cmap->N; 1231 } else { 1232 PetscCall(ISGetIndices(iscol[0], &icol)); 1233 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1234 } 1235 1236 if (scall == MAT_INITIAL_MATRIX) { 1237 PetscInt *sbuf2_i, *cworkA, lwrite, ctmp; 1238 1239 /* Get some new tags to keep the communication clean */ 1240 tag1 = ((PetscObject)C)->tag; 1241 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 1242 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 1243 1244 /* evaluate communication - mesg to who, length of mesg, and buffer space 1245 required. Based on this, buffers are allocated, and data copied into them */ 1246 PetscCall(PetscCalloc2(size, &w1, size, &w2)); 1247 PetscCall(PetscMalloc1(nrow, &row2proc)); 1248 1249 /* w1[proc] = num of rows owned by proc -- to be requested */ 1250 proc = 0; 1251 nrqs = 0; /* num of outgoing messages */ 1252 for (j = 0; j < nrow; j++) { 1253 row = irow[j]; 1254 if (!isrowsorted) proc = 0; 1255 while (row >= C->rmap->range[proc + 1]) proc++; 1256 w1[proc]++; 1257 row2proc[j] = proc; /* map row index to proc */ 1258 1259 if (proc != rank && !w2[proc]) { 1260 w2[proc] = 1; 1261 nrqs++; 1262 } 1263 } 1264 w1[rank] = 0; /* rows owned by self will not be requested */ 1265 1266 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 1267 for (proc = 0, j = 0; proc < size; proc++) { 1268 if (w1[proc]) pa[j++] = proc; 1269 } 1270 1271 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1272 msz = 0; /* total mesg length (for all procs) */ 1273 for (i = 0; i < nrqs; i++) { 1274 proc = pa[i]; 1275 w1[proc] += 3; 1276 msz += w1[proc]; 1277 } 1278 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 1279 1280 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1281 /* if w2[proc]=1, a message of length w1[proc] will be 
sent to proc; */ 1282 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 1283 1284 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1285 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1286 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 1287 1288 /* Now post the Irecvs corresponding to these messages */ 1289 PetscCall(PetscPostIrecvInt(comm, tag1, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 1290 1291 PetscCall(PetscFree(onodes1)); 1292 PetscCall(PetscFree(olengths1)); 1293 1294 /* Allocate Memory for outgoing messages */ 1295 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 1296 PetscCall(PetscArrayzero(sbuf1, size)); 1297 PetscCall(PetscArrayzero(ptr, size)); 1298 1299 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1300 iptr = tmp; 1301 for (i = 0; i < nrqs; i++) { 1302 proc = pa[i]; 1303 sbuf1[proc] = iptr; 1304 iptr += w1[proc]; 1305 } 1306 1307 /* Form the outgoing messages */ 1308 /* Initialize the header space */ 1309 for (i = 0; i < nrqs; i++) { 1310 proc = pa[i]; 1311 PetscCall(PetscArrayzero(sbuf1[proc], 3)); 1312 ptr[proc] = sbuf1[proc] + 3; 1313 } 1314 1315 /* Parse the isrow and copy data into outbuf */ 1316 PetscCall(PetscArrayzero(ctr, size)); 1317 for (j = 0; j < nrow; j++) { /* parse the indices of each IS */ 1318 proc = row2proc[j]; 1319 if (proc != rank) { /* copy to the outgoing buf*/ 1320 *ptr[proc] = irow[j]; 1321 ctr[proc]++; 1322 ptr[proc]++; 1323 } 1324 } 1325 1326 /* Update the headers for the current IS */ 1327 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 1328 if ((ctr_j = ctr[j])) { 1329 sbuf1_j = sbuf1[j]; 1330 k = ++sbuf1_j[0]; 1331 sbuf1_j[2 * k] = ctr_j; 1332 sbuf1_j[2 * k - 1] = 0; 1333 } 1334 } 1335 1336 /* Now post the sends */ 1337 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 1338 for (i = 0; i < nrqs; ++i) { 1339 proc = pa[i]; 1340 PetscCallMPI(MPI_Isend(sbuf1[proc], 
w1[proc], MPIU_INT, proc, tag1, comm, s_waits1 + i)); 1341 } 1342 1343 /* Post Receives to capture the buffer size */ 1344 PetscCall(PetscMalloc4(nrqs, &r_status2, nrqr, &s_waits2, nrqs, &r_waits2, nrqr, &s_status2)); 1345 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 1346 1347 if (nrqs) rbuf2[0] = tmp + msz; 1348 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 1349 1350 for (i = 0; i < nrqs; ++i) { 1351 proc = pa[i]; 1352 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[proc], MPIU_INT, proc, tag2, comm, r_waits2 + i)); 1353 } 1354 1355 PetscCall(PetscFree2(w1, w2)); 1356 1357 /* Send to other procs the buf size they should allocate */ 1358 /* Receive messages*/ 1359 PetscCall(PetscMalloc1(nrqr, &r_status1)); 1360 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 1361 1362 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, r_status1)); 1363 for (i = 0; i < nrqr; ++i) { 1364 req_size[i] = 0; 1365 rbuf1_i = rbuf1[i]; 1366 start = 2 * rbuf1_i[0] + 1; 1367 PetscCallMPI(MPI_Get_count(r_status1 + i, MPIU_INT, &end)); 1368 PetscCall(PetscMalloc1(end, &sbuf2[i])); 1369 sbuf2_i = sbuf2[i]; 1370 for (j = start; j < end; j++) { 1371 k = rbuf1_i[j] - rstart; 1372 ncols = ai[k + 1] - ai[k] + bi[k + 1] - bi[k]; 1373 sbuf2_i[j] = ncols; 1374 req_size[i] += ncols; 1375 } 1376 req_source1[i] = r_status1[i].MPI_SOURCE; 1377 1378 /* form the header */ 1379 sbuf2_i[0] = req_size[i]; 1380 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 1381 1382 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 1383 } 1384 1385 PetscCall(PetscFree(r_status1)); 1386 PetscCall(PetscFree(r_waits1)); 1387 1388 /* rbuf2 is received, Post recv column indices a->j */ 1389 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, r_status2)); 1390 1391 PetscCall(PetscMalloc4(nrqs, &r_waits3, nrqr, &s_waits3, nrqs, &r_status3, nrqr, &s_status3)); 1392 for (i = 0; i < nrqs; ++i) { 1393 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 
1394 req_source2[i] = r_status2[i].MPI_SOURCE; 1395 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 1396 } 1397 1398 /* Wait on sends1 and sends2 */ 1399 PetscCall(PetscMalloc1(nrqs, &s_status1)); 1400 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status1)); 1401 PetscCall(PetscFree(s_waits1)); 1402 PetscCall(PetscFree(s_status1)); 1403 1404 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status2)); 1405 PetscCall(PetscFree4(r_status2, s_waits2, r_waits2, s_status2)); 1406 1407 /* Now allocate sending buffers for a->j, and send them off */ 1408 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 1409 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1410 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 1411 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 1412 1413 for (i = 0; i < nrqr; i++) { /* for each requested message */ 1414 rbuf1_i = rbuf1[i]; 1415 sbuf_aj_i = sbuf_aj[i]; 1416 ct1 = 2 * rbuf1_i[0] + 1; 1417 ct2 = 0; 1418 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1419 1420 kmax = rbuf1[i][2]; 1421 for (k = 0; k < kmax; k++, ct1++) { /* for each row */ 1422 row = rbuf1_i[ct1] - rstart; 1423 nzA = ai[row + 1] - ai[row]; 1424 nzB = bi[row + 1] - bi[row]; 1425 ncols = nzA + nzB; 1426 cworkA = aj + ai[row]; 1427 cworkB = bj + bi[row]; 1428 1429 /* load the column indices for this row into cols*/ 1430 cols = sbuf_aj_i + ct2; 1431 1432 lwrite = 0; 1433 for (l = 0; l < nzB; l++) { 1434 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1435 } 1436 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1437 for (l = 0; l < nzB; l++) { 1438 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1439 } 1440 1441 ct2 += ncols; 1442 } 1443 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 1444 } 1445 1446 /* create column map (cmap): global col of C -> local col of submat */ 1447 
#if defined(PETSC_USE_CTABLE) 1448 if (!allcolumns) { 1449 PetscCall(PetscTableCreate(ncol, C->cmap->N, &cmap)); 1450 PetscCall(PetscCalloc1(C->cmap->n, &cmap_loc)); 1451 for (j = 0; j < ncol; j++) { /* use array cmap_loc[] for local col indices */ 1452 if (icol[j] >= cstart && icol[j] < cend) { 1453 cmap_loc[icol[j] - cstart] = j + 1; 1454 } else { /* use PetscTable for non-local col indices */ 1455 PetscCall(PetscTableAdd(cmap, icol[j] + 1, j + 1, INSERT_VALUES)); 1456 } 1457 } 1458 } else { 1459 cmap = NULL; 1460 cmap_loc = NULL; 1461 } 1462 PetscCall(PetscCalloc1(C->rmap->n, &rmap_loc)); 1463 #else 1464 if (!allcolumns) { 1465 PetscCall(PetscCalloc1(C->cmap->N, &cmap)); 1466 for (j = 0; j < ncol; j++) cmap[icol[j]] = j + 1; 1467 } else { 1468 cmap = NULL; 1469 } 1470 #endif 1471 1472 /* Create lens for MatSeqAIJSetPreallocation() */ 1473 PetscCall(PetscCalloc1(nrow, &lens)); 1474 1475 /* Compute lens from local part of C */ 1476 for (j = 0; j < nrow; j++) { 1477 row = irow[j]; 1478 proc = row2proc[j]; 1479 if (proc == rank) { 1480 /* diagonal part A = c->A */ 1481 ncols = ai[row - rstart + 1] - ai[row - rstart]; 1482 cols = aj + ai[row - rstart]; 1483 if (!allcolumns) { 1484 for (k = 0; k < ncols; k++) { 1485 #if defined(PETSC_USE_CTABLE) 1486 tcol = cmap_loc[cols[k]]; 1487 #else 1488 tcol = cmap[cols[k] + cstart]; 1489 #endif 1490 if (tcol) lens[j]++; 1491 } 1492 } else { /* allcolumns */ 1493 lens[j] = ncols; 1494 } 1495 1496 /* off-diagonal part B = c->B */ 1497 ncols = bi[row - rstart + 1] - bi[row - rstart]; 1498 cols = bj + bi[row - rstart]; 1499 if (!allcolumns) { 1500 for (k = 0; k < ncols; k++) { 1501 #if defined(PETSC_USE_CTABLE) 1502 PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol)); 1503 #else 1504 tcol = cmap[bmap[cols[k]]]; 1505 #endif 1506 if (tcol) lens[j]++; 1507 } 1508 } else { /* allcolumns */ 1509 lens[j] += ncols; 1510 } 1511 } 1512 } 1513 1514 /* Create row map (rmap): global row of C -> local row of submat */ 1515 #if 
defined(PETSC_USE_CTABLE) 1516 PetscCall(PetscTableCreate(nrow, C->rmap->N, &rmap)); 1517 for (j = 0; j < nrow; j++) { 1518 row = irow[j]; 1519 proc = row2proc[j]; 1520 if (proc == rank) { /* a local row */ 1521 rmap_loc[row - rstart] = j; 1522 } else { 1523 PetscCall(PetscTableAdd(rmap, irow[j] + 1, j + 1, INSERT_VALUES)); 1524 } 1525 } 1526 #else 1527 PetscCall(PetscCalloc1(C->rmap->N, &rmap)); 1528 for (j = 0; j < nrow; j++) rmap[irow[j]] = j; 1529 #endif 1530 1531 /* Update lens from offproc data */ 1532 /* recv a->j is done */ 1533 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, r_status3)); 1534 for (i = 0; i < nrqs; i++) { 1535 proc = pa[i]; 1536 sbuf1_i = sbuf1[proc]; 1537 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1538 ct1 = 2 + 1; 1539 ct2 = 0; 1540 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1541 rbuf3_i = rbuf3[i]; /* received C->j */ 1542 1543 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1544 max1 = sbuf1_i[2]; 1545 for (k = 0; k < max1; k++, ct1++) { 1546 #if defined(PETSC_USE_CTABLE) 1547 PetscCall(PetscTableFind(rmap, sbuf1_i[ct1] + 1, &row)); 1548 row--; 1549 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 1550 #else 1551 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1552 #endif 1553 /* Now, store row index of submat in sbuf1_i[ct1] */ 1554 sbuf1_i[ct1] = row; 1555 1556 nnz = rbuf2_i[ct1]; 1557 if (!allcolumns) { 1558 for (l = 0; l < nnz; l++, ct2++) { 1559 #if defined(PETSC_USE_CTABLE) 1560 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1561 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1562 } else { 1563 PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol)); 1564 } 1565 #else 1566 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1567 #endif 1568 if (tcol) lens[row]++; 1569 } 1570 } else { /* allcolumns */ 1571 lens[row] += nnz; 1572 } 1573 } 1574 } 1575 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, 
s_status3)); 1576 PetscCall(PetscFree4(r_waits3, s_waits3, r_status3, s_status3)); 1577 1578 /* Create the submatrices */ 1579 PetscCall(MatCreate(PETSC_COMM_SELF, &submat)); 1580 PetscCall(MatSetSizes(submat, nrow, ncol, PETSC_DETERMINE, PETSC_DETERMINE)); 1581 1582 PetscCall(ISGetBlockSize(isrow[0], &i)); 1583 PetscCall(ISGetBlockSize(iscol[0], &j)); 1584 PetscCall(MatSetBlockSizes(submat, i, j)); 1585 PetscCall(MatSetType(submat, ((PetscObject)A)->type_name)); 1586 PetscCall(MatSeqAIJSetPreallocation(submat, 0, lens)); 1587 1588 /* create struct Mat_SubSppt and attached it to submat */ 1589 PetscCall(PetscNew(&smatis1)); 1590 subc = (Mat_SeqAIJ *)submat->data; 1591 subc->submatis1 = smatis1; 1592 1593 smatis1->id = 0; 1594 smatis1->nrqs = nrqs; 1595 smatis1->nrqr = nrqr; 1596 smatis1->rbuf1 = rbuf1; 1597 smatis1->rbuf2 = rbuf2; 1598 smatis1->rbuf3 = rbuf3; 1599 smatis1->sbuf2 = sbuf2; 1600 smatis1->req_source2 = req_source2; 1601 1602 smatis1->sbuf1 = sbuf1; 1603 smatis1->ptr = ptr; 1604 smatis1->tmp = tmp; 1605 smatis1->ctr = ctr; 1606 1607 smatis1->pa = pa; 1608 smatis1->req_size = req_size; 1609 smatis1->req_source1 = req_source1; 1610 1611 smatis1->allcolumns = allcolumns; 1612 smatis1->singleis = PETSC_TRUE; 1613 smatis1->row2proc = row2proc; 1614 smatis1->rmap = rmap; 1615 smatis1->cmap = cmap; 1616 #if defined(PETSC_USE_CTABLE) 1617 smatis1->rmap_loc = rmap_loc; 1618 smatis1->cmap_loc = cmap_loc; 1619 #endif 1620 1621 smatis1->destroy = submat->ops->destroy; 1622 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1623 submat->factortype = C->factortype; 1624 1625 /* compute rmax */ 1626 rmax = 0; 1627 for (i = 0; i < nrow; i++) rmax = PetscMax(rmax, lens[i]); 1628 1629 } else { /* scall == MAT_REUSE_MATRIX */ 1630 submat = submats[0]; 1631 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 1632 1633 subc = (Mat_SeqAIJ *)submat->data; 1634 rmax = subc->rmax; 1635 smatis1 = subc->submatis1; 1636 nrqs = smatis1->nrqs; 1637 nrqr = smatis1->nrqr; 1638 rbuf1 = smatis1->rbuf1; 1639 rbuf2 = smatis1->rbuf2; 1640 rbuf3 = smatis1->rbuf3; 1641 req_source2 = smatis1->req_source2; 1642 1643 sbuf1 = smatis1->sbuf1; 1644 sbuf2 = smatis1->sbuf2; 1645 ptr = smatis1->ptr; 1646 tmp = smatis1->tmp; 1647 ctr = smatis1->ctr; 1648 1649 pa = smatis1->pa; 1650 req_size = smatis1->req_size; 1651 req_source1 = smatis1->req_source1; 1652 1653 allcolumns = smatis1->allcolumns; 1654 row2proc = smatis1->row2proc; 1655 rmap = smatis1->rmap; 1656 cmap = smatis1->cmap; 1657 #if defined(PETSC_USE_CTABLE) 1658 rmap_loc = smatis1->rmap_loc; 1659 cmap_loc = smatis1->cmap_loc; 1660 #endif 1661 } 1662 1663 /* Post recv matrix values */ 1664 PetscCall(PetscMalloc3(nrqs, &rbuf4, rmax, &subcols, rmax, &subvals)); 1665 PetscCall(PetscMalloc4(nrqs, &r_waits4, nrqr, &s_waits4, nrqs, &r_status4, nrqr, &s_status4)); 1666 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 1667 for (i = 0; i < nrqs; ++i) { 1668 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 1669 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 1670 } 1671 1672 /* Allocate sending buffers for a->a, and send them off */ 1673 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 1674 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 1675 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 1676 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 1677 1678 for (i = 0; i < nrqr; i++) { 1679 rbuf1_i = rbuf1[i]; 1680 sbuf_aa_i = sbuf_aa[i]; 1681 ct1 = 2 * rbuf1_i[0] + 1; 1682 ct2 = 0; 1683 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1684 1685 kmax = rbuf1_i[2]; 1686 for (k = 0; k < kmax; k++, ct1++) { 1687 row = rbuf1_i[ct1] - rstart; 1688 nzA = ai[row + 1] - ai[row]; 1689 nzB = bi[row + 1] - bi[row]; 1690 ncols = 
nzA + nzB; 1691 cworkB = bj + bi[row]; 1692 vworkA = a_a + ai[row]; 1693 vworkB = b_a + bi[row]; 1694 1695 /* load the column values for this row into vals*/ 1696 vals = sbuf_aa_i + ct2; 1697 1698 lwrite = 0; 1699 for (l = 0; l < nzB; l++) { 1700 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1701 } 1702 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 1703 for (l = 0; l < nzB; l++) { 1704 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1705 } 1706 1707 ct2 += ncols; 1708 } 1709 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 1710 } 1711 1712 /* Assemble submat */ 1713 /* First assemble the local rows */ 1714 for (j = 0; j < nrow; j++) { 1715 row = irow[j]; 1716 proc = row2proc[j]; 1717 if (proc == rank) { 1718 Crow = row - rstart; /* local row index of C */ 1719 #if defined(PETSC_USE_CTABLE) 1720 row = rmap_loc[Crow]; /* row index of submat */ 1721 #else 1722 row = rmap[row]; 1723 #endif 1724 1725 if (allcolumns) { 1726 /* diagonal part A = c->A */ 1727 ncols = ai[Crow + 1] - ai[Crow]; 1728 cols = aj + ai[Crow]; 1729 vals = a_a + ai[Crow]; 1730 i = 0; 1731 for (k = 0; k < ncols; k++) { 1732 subcols[i] = cols[k] + cstart; 1733 subvals[i++] = vals[k]; 1734 } 1735 1736 /* off-diagonal part B = c->B */ 1737 ncols = bi[Crow + 1] - bi[Crow]; 1738 cols = bj + bi[Crow]; 1739 vals = b_a + bi[Crow]; 1740 for (k = 0; k < ncols; k++) { 1741 subcols[i] = bmap[cols[k]]; 1742 subvals[i++] = vals[k]; 1743 } 1744 1745 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1746 1747 } else { /* !allcolumns */ 1748 #if defined(PETSC_USE_CTABLE) 1749 /* diagonal part A = c->A */ 1750 ncols = ai[Crow + 1] - ai[Crow]; 1751 cols = aj + ai[Crow]; 1752 vals = a_a + ai[Crow]; 1753 i = 0; 1754 for (k = 0; k < ncols; k++) { 1755 tcol = cmap_loc[cols[k]]; 1756 if (tcol) { 1757 subcols[i] = --tcol; 1758 subvals[i++] = vals[k]; 1759 } 1760 } 1761 1762 /* off-diagonal part B = c->B 
*/ 1763 ncols = bi[Crow + 1] - bi[Crow]; 1764 cols = bj + bi[Crow]; 1765 vals = b_a + bi[Crow]; 1766 for (k = 0; k < ncols; k++) { 1767 PetscCall(PetscTableFind(cmap, bmap[cols[k]] + 1, &tcol)); 1768 if (tcol) { 1769 subcols[i] = --tcol; 1770 subvals[i++] = vals[k]; 1771 } 1772 } 1773 #else 1774 /* diagonal part A = c->A */ 1775 ncols = ai[Crow + 1] - ai[Crow]; 1776 cols = aj + ai[Crow]; 1777 vals = a_a + ai[Crow]; 1778 i = 0; 1779 for (k = 0; k < ncols; k++) { 1780 tcol = cmap[cols[k] + cstart]; 1781 if (tcol) { 1782 subcols[i] = --tcol; 1783 subvals[i++] = vals[k]; 1784 } 1785 } 1786 1787 /* off-diagonal part B = c->B */ 1788 ncols = bi[Crow + 1] - bi[Crow]; 1789 cols = bj + bi[Crow]; 1790 vals = b_a + bi[Crow]; 1791 for (k = 0; k < ncols; k++) { 1792 tcol = cmap[bmap[cols[k]]]; 1793 if (tcol) { 1794 subcols[i] = --tcol; 1795 subvals[i++] = vals[k]; 1796 } 1797 } 1798 #endif 1799 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, i, subcols, subvals, INSERT_VALUES)); 1800 } 1801 } 1802 } 1803 1804 /* Now assemble the off-proc rows */ 1805 for (i = 0; i < nrqs; i++) { /* for each requested message */ 1806 /* recv values from other processes */ 1807 PetscCallMPI(MPI_Waitany(nrqs, r_waits4, &idex, r_status4 + i)); 1808 proc = pa[idex]; 1809 sbuf1_i = sbuf1[proc]; 1810 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1811 ct1 = 2 + 1; 1812 ct2 = 0; /* count of received C->j */ 1813 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1814 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1815 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1816 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1817 1818 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1819 max1 = sbuf1_i[2]; /* num of rows */ 1820 for (k = 0; k < max1; k++, ct1++) { /* for each recved row */ 1821 row = sbuf1_i[ct1]; 
/* row index of submat */ 1822 if (!allcolumns) { 1823 idex = 0; 1824 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1825 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1826 for (l = 0; l < nnz; l++, ct2++) { /* for each recved column */ 1827 #if defined(PETSC_USE_CTABLE) 1828 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) { 1829 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1830 } else { 1831 PetscCall(PetscTableFind(cmap, rbuf3_i[ct2] + 1, &tcol)); 1832 } 1833 #else 1834 tcol = cmap[rbuf3_i[ct2]]; 1835 #endif 1836 if (tcol) { 1837 subcols[idex] = --tcol; /* may not be sorted */ 1838 subvals[idex++] = rbuf4_i[ct2]; 1839 1840 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1841 For reuse, we replace received C->j with index that should be inserted to submat */ 1842 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1843 } 1844 } 1845 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, idex, subcols, subvals, INSERT_VALUES)); 1846 } else { /* scall == MAT_REUSE_MATRIX */ 1847 submat = submats[0]; 1848 subc = (Mat_SeqAIJ *)submat->data; 1849 1850 nnz = subc->i[row + 1] - subc->i[row]; /* num of submat entries in this row */ 1851 for (l = 0; l < nnz; l++) { 1852 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1853 subvals[idex++] = rbuf4_i[ct2]; 1854 } 1855 1856 bj = subc->j + subc->i[row]; /* sorted column indices */ 1857 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, bj, subvals, INSERT_VALUES)); 1858 } 1859 } else { /* allcolumns */ 1860 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1861 PetscCall(MatSetValues_SeqAIJ(submat, 1, &row, nnz, rbuf3_i + ct2, rbuf4_i + ct2, INSERT_VALUES)); 1862 ct2 += nnz; 1863 } 1864 } 1865 } 1866 1867 /* sending a->a are done */ 1868 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, s_status4)); 1869 PetscCall(PetscFree4(r_waits4, s_waits4, r_status4, s_status4)); 1870 1871 PetscCall(MatAssemblyBegin(submat, MAT_FINAL_ASSEMBLY)); 1872 
PetscCall(MatAssemblyEnd(submat, MAT_FINAL_ASSEMBLY)); 1873 submats[0] = submat; 1874 1875 /* Restore the indices */ 1876 PetscCall(ISRestoreIndices(isrow[0], &irow)); 1877 if (!allcolumns) PetscCall(ISRestoreIndices(iscol[0], &icol)); 1878 1879 /* Destroy allocated memory */ 1880 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 1881 PetscCall(PetscFree3(rbuf4, subcols, subvals)); 1882 if (sbuf_aa) { 1883 PetscCall(PetscFree(sbuf_aa[0])); 1884 PetscCall(PetscFree(sbuf_aa)); 1885 } 1886 1887 if (scall == MAT_INITIAL_MATRIX) { 1888 PetscCall(PetscFree(lens)); 1889 if (sbuf_aj) { 1890 PetscCall(PetscFree(sbuf_aj[0])); 1891 PetscCall(PetscFree(sbuf_aj)); 1892 } 1893 } 1894 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 1895 PetscCall(MatSeqAIJRestoreArrayRead(B, (const PetscScalar **)&b_a)); 1896 PetscFunctionReturn(0); 1897 } 1898 1899 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1900 { 1901 PetscInt ncol; 1902 PetscBool colflag, allcolumns = PETSC_FALSE; 1903 1904 PetscFunctionBegin; 1905 /* Allocate memory to hold all the submatrices */ 1906 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(2, submat)); 1907 1908 /* Check for special case: each processor gets entire matrix columns */ 1909 PetscCall(ISIdentity(iscol[0], &colflag)); 1910 PetscCall(ISGetLocalSize(iscol[0], &ncol)); 1911 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1912 1913 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C, ismax, isrow, iscol, scall, allcolumns, *submat)); 1914 PetscFunctionReturn(0); 1915 } 1916 1917 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[]) 1918 { 1919 PetscInt nmax, nstages = 0, i, pos, max_no, nrow, ncol, in[2], out[2]; 1920 PetscBool rowflag, colflag, wantallmatrix = PETSC_FALSE; 1921 Mat_SeqAIJ *subc; 1922 Mat_SubSppt *smat; 1923 1924 
PetscFunctionBegin; 1925 /* Check for special case: each processor has a single IS */ 1926 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1927 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 1928 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1929 PetscFunctionReturn(0); 1930 } 1931 1932 /* Collect global wantallmatrix and nstages */ 1933 if (!C->cmap->N) nmax = 20 * 1000000 / sizeof(PetscInt); 1934 else nmax = 20 * 1000000 / (C->cmap->N * sizeof(PetscInt)); 1935 if (!nmax) nmax = 1; 1936 1937 if (scall == MAT_INITIAL_MATRIX) { 1938 /* Collect global wantallmatrix and nstages */ 1939 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1940 PetscCall(ISIdentity(*isrow, &rowflag)); 1941 PetscCall(ISIdentity(*iscol, &colflag)); 1942 PetscCall(ISGetLocalSize(*isrow, &nrow)); 1943 PetscCall(ISGetLocalSize(*iscol, &ncol)); 1944 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1945 wantallmatrix = PETSC_TRUE; 1946 1947 PetscCall(PetscOptionsGetBool(((PetscObject)C)->options, ((PetscObject)C)->prefix, "-use_fast_submatrix", &wantallmatrix, NULL)); 1948 } 1949 } 1950 1951 /* Determine the number of stages through which submatrices are done 1952 Each stage will extract nmax submatrices. 1953 nmax is determined by the matrix column dimension. 1954 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1955 */ 1956 nstages = ismax / nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 1957 1958 in[0] = -1 * (PetscInt)wantallmatrix; 1959 in[1] = nstages; 1960 PetscCall(MPIU_Allreduce(in, out, 2, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C))); 1961 wantallmatrix = (PetscBool)(-out[0]); 1962 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 1963 1964 } else { /* MAT_REUSE_MATRIX */ 1965 if (ismax) { 1966 subc = (Mat_SeqAIJ *)(*submat)[0]->data; 1967 smat = subc->submatis1; 1968 } else { /* (*submat)[0] is a dummy matrix */ 1969 smat = (Mat_SubSppt *)(*submat)[0]->data; 1970 } 1971 if (!smat) { 1972 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 1973 wantallmatrix = PETSC_TRUE; 1974 } else if (smat->singleis) { 1975 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C, ismax, isrow, iscol, scall, submat)); 1976 PetscFunctionReturn(0); 1977 } else { 1978 nstages = smat->nstages; 1979 } 1980 } 1981 1982 if (wantallmatrix) { 1983 PetscCall(MatCreateSubMatrix_MPIAIJ_All(C, MAT_GET_VALUES, scall, submat)); 1984 PetscFunctionReturn(0); 1985 } 1986 1987 /* Allocate memory to hold all the submatrices and dummy submatrices */ 1988 if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(ismax + nstages, submat)); 1989 1990 for (i = 0, pos = 0; i < nstages; i++) { 1991 if (pos + nmax <= ismax) max_no = nmax; 1992 else if (pos >= ismax) max_no = 0; 1993 else max_no = ismax - pos; 1994 1995 PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C, max_no, isrow + pos, iscol + pos, scall, *submat + pos)); 1996 if (!max_no) { 1997 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 1998 smat = (Mat_SubSppt *)(*submat)[pos]->data; 1999 smat->nstages = nstages; 2000 } 2001 pos++; /* advance to next dummy matrix if any */ 2002 } else pos += max_no; 2003 } 2004 2005 if (ismax && scall == MAT_INITIAL_MATRIX) { 2006 /* save nstages for reuse */ 2007 subc = (Mat_SeqAIJ *)(*submat)[0]->data; 2008 smat = subc->submatis1; 2009 smat->nstages = nstages; 2010 } 2011 
PetscFunctionReturn(0); 2012 } 2013 2014 /* -------------------------------------------------------------------------*/ 2015 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submats) 2016 { 2017 Mat_MPIAIJ *c = (Mat_MPIAIJ *)C->data; 2018 Mat A = c->A; 2019 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)c->B->data, *subc; 2020 const PetscInt **icol, **irow; 2021 PetscInt *nrow, *ncol, start; 2022 PetscMPIInt rank, size, tag0, tag2, tag3, tag4, *w1, *w2, *w3, *w4, nrqr; 2023 PetscInt **sbuf1, **sbuf2, i, j, k, l, ct1, ct2, **rbuf1, row, proc = -1; 2024 PetscInt nrqs = 0, msz, **ptr = NULL, *req_size = NULL, *ctr = NULL, *pa, *tmp = NULL, tcol; 2025 PetscInt **rbuf3 = NULL, *req_source1 = NULL, *req_source2, **sbuf_aj, **rbuf2 = NULL, max1, max2; 2026 PetscInt **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2, jmax; 2027 #if defined(PETSC_USE_CTABLE) 2028 PetscTable *cmap, cmap_i = NULL, *rmap, rmap_i; 2029 #else 2030 PetscInt **cmap, *cmap_i = NULL, **rmap, *rmap_i; 2031 #endif 2032 const PetscInt *irow_i; 2033 PetscInt ctr_j, *sbuf1_j, *sbuf_aj_i, *rbuf1_i, kmax, *lens_i; 2034 MPI_Request *s_waits1, *r_waits1, *s_waits2, *r_waits2, *r_waits3; 2035 MPI_Request *r_waits4, *s_waits3, *s_waits4; 2036 MPI_Comm comm; 2037 PetscScalar **rbuf4, *rbuf4_i, **sbuf_aa, *vals, *mat_a, *imat_a, *sbuf_aa_i; 2038 PetscMPIInt *onodes1, *olengths1, end; 2039 PetscInt **row2proc, *row2proc_i, ilen_row, *imat_ilen, *imat_j, *imat_i, old_row; 2040 Mat_SubSppt *smat_i; 2041 PetscBool *issorted, *allcolumns, colflag, iscsorted = PETSC_TRUE; 2042 PetscInt *sbuf1_i, *rbuf2_i, *rbuf3_i, ilen; 2043 2044 PetscFunctionBegin; 2045 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2046 size = c->size; 2047 rank = c->rank; 2048 2049 PetscCall(PetscMalloc4(ismax, &row2proc, ismax, &cmap, ismax, &rmap, ismax + 1, &allcolumns)); 2050 PetscCall(PetscMalloc5(ismax, (PetscInt ***)&irow, ismax, (PetscInt 
***)&icol, ismax, &nrow, ismax, &ncol, ismax, &issorted)); 2051 2052 for (i = 0; i < ismax; i++) { 2053 PetscCall(ISSorted(iscol[i], &issorted[i])); 2054 if (!issorted[i]) iscsorted = issorted[i]; 2055 2056 PetscCall(ISSorted(isrow[i], &issorted[i])); 2057 2058 PetscCall(ISGetIndices(isrow[i], &irow[i])); 2059 PetscCall(ISGetLocalSize(isrow[i], &nrow[i])); 2060 2061 /* Check for special case: allcolumn */ 2062 PetscCall(ISIdentity(iscol[i], &colflag)); 2063 PetscCall(ISGetLocalSize(iscol[i], &ncol[i])); 2064 if (colflag && ncol[i] == C->cmap->N) { 2065 allcolumns[i] = PETSC_TRUE; 2066 icol[i] = NULL; 2067 } else { 2068 allcolumns[i] = PETSC_FALSE; 2069 PetscCall(ISGetIndices(iscol[i], &icol[i])); 2070 } 2071 } 2072 2073 if (scall == MAT_REUSE_MATRIX) { 2074 /* Assumes new rows are same length as the old rows */ 2075 for (i = 0; i < ismax; i++) { 2076 PetscCheck(submats[i], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats[%" PetscInt_FMT "] is null, cannot reuse", i); 2077 subc = (Mat_SeqAIJ *)submats[i]->data; 2078 PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]), PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. 
wrong size"); 2079 2080 /* Initial matrix as if empty */ 2081 PetscCall(PetscArrayzero(subc->ilen, submats[i]->rmap->n)); 2082 2083 smat_i = subc->submatis1; 2084 2085 nrqs = smat_i->nrqs; 2086 nrqr = smat_i->nrqr; 2087 rbuf1 = smat_i->rbuf1; 2088 rbuf2 = smat_i->rbuf2; 2089 rbuf3 = smat_i->rbuf3; 2090 req_source2 = smat_i->req_source2; 2091 2092 sbuf1 = smat_i->sbuf1; 2093 sbuf2 = smat_i->sbuf2; 2094 ptr = smat_i->ptr; 2095 tmp = smat_i->tmp; 2096 ctr = smat_i->ctr; 2097 2098 pa = smat_i->pa; 2099 req_size = smat_i->req_size; 2100 req_source1 = smat_i->req_source1; 2101 2102 allcolumns[i] = smat_i->allcolumns; 2103 row2proc[i] = smat_i->row2proc; 2104 rmap[i] = smat_i->rmap; 2105 cmap[i] = smat_i->cmap; 2106 } 2107 2108 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2109 PetscCheck(submats[0], PETSC_COMM_SELF, PETSC_ERR_ARG_NULL, "submats are null, cannot reuse"); 2110 smat_i = (Mat_SubSppt *)submats[0]->data; 2111 2112 nrqs = smat_i->nrqs; 2113 nrqr = smat_i->nrqr; 2114 rbuf1 = smat_i->rbuf1; 2115 rbuf2 = smat_i->rbuf2; 2116 rbuf3 = smat_i->rbuf3; 2117 req_source2 = smat_i->req_source2; 2118 2119 sbuf1 = smat_i->sbuf1; 2120 sbuf2 = smat_i->sbuf2; 2121 ptr = smat_i->ptr; 2122 tmp = smat_i->tmp; 2123 ctr = smat_i->ctr; 2124 2125 pa = smat_i->pa; 2126 req_size = smat_i->req_size; 2127 req_source1 = smat_i->req_source1; 2128 2129 allcolumns[0] = PETSC_FALSE; 2130 } 2131 } else { /* scall == MAT_INITIAL_MATRIX */ 2132 /* Get some new tags to keep the communication clean */ 2133 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 2134 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag3)); 2135 2136 /* evaluate communication - mesg to who, length of mesg, and buffer space 2137 required. 
Based on this, buffers are allocated, and data copied into them*/ 2138 PetscCall(PetscCalloc4(size, &w1, size, &w2, size, &w3, size, &w4)); /* mesg size, initialize work vectors */ 2139 2140 for (i = 0; i < ismax; i++) { 2141 jmax = nrow[i]; 2142 irow_i = irow[i]; 2143 2144 PetscCall(PetscMalloc1(jmax, &row2proc_i)); 2145 row2proc[i] = row2proc_i; 2146 2147 if (issorted[i]) proc = 0; 2148 for (j = 0; j < jmax; j++) { 2149 if (!issorted[i]) proc = 0; 2150 row = irow_i[j]; 2151 while (row >= C->rmap->range[proc + 1]) proc++; 2152 w4[proc]++; 2153 row2proc_i[j] = proc; /* map row index to proc */ 2154 } 2155 for (j = 0; j < size; j++) { 2156 if (w4[j]) { 2157 w1[j] += w4[j]; 2158 w3[j]++; 2159 w4[j] = 0; 2160 } 2161 } 2162 } 2163 2164 nrqs = 0; /* no of outgoing messages */ 2165 msz = 0; /* total mesg length (for all procs) */ 2166 w1[rank] = 0; /* no mesg sent to self */ 2167 w3[rank] = 0; 2168 for (i = 0; i < size; i++) { 2169 if (w1[i]) { 2170 w2[i] = 1; 2171 nrqs++; 2172 } /* there exists a message to proc i */ 2173 } 2174 PetscCall(PetscMalloc1(nrqs, &pa)); /*(proc -array)*/ 2175 for (i = 0, j = 0; i < size; i++) { 2176 if (w1[i]) { 2177 pa[j] = i; 2178 j++; 2179 } 2180 } 2181 2182 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2183 for (i = 0; i < nrqs; i++) { 2184 j = pa[i]; 2185 w1[j] += w2[j] + 2 * w3[j]; 2186 msz += w1[j]; 2187 } 2188 PetscCall(PetscInfo(0, "Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n", nrqs, msz)); 2189 2190 /* Determine the number of messages to expect, their lengths, from from-ids */ 2191 PetscCall(PetscGatherNumberOfMessages(comm, w2, w1, &nrqr)); 2192 PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, w1, &onodes1, &olengths1)); 2193 2194 /* Now post the Irecvs corresponding to these messages */ 2195 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag0)); 2196 PetscCall(PetscPostIrecvInt(comm, tag0, nrqr, onodes1, olengths1, &rbuf1, &r_waits1)); 2197 2198 /* Allocate Memory 
for outgoing messages */ 2199 PetscCall(PetscMalloc4(size, &sbuf1, size, &ptr, 2 * msz, &tmp, size, &ctr)); 2200 PetscCall(PetscArrayzero(sbuf1, size)); 2201 PetscCall(PetscArrayzero(ptr, size)); 2202 2203 { 2204 PetscInt *iptr = tmp; 2205 k = 0; 2206 for (i = 0; i < nrqs; i++) { 2207 j = pa[i]; 2208 iptr += k; 2209 sbuf1[j] = iptr; 2210 k = w1[j]; 2211 } 2212 } 2213 2214 /* Form the outgoing messages. Initialize the header space */ 2215 for (i = 0; i < nrqs; i++) { 2216 j = pa[i]; 2217 sbuf1[j][0] = 0; 2218 PetscCall(PetscArrayzero(sbuf1[j] + 1, 2 * w3[j])); 2219 ptr[j] = sbuf1[j] + 2 * w3[j] + 1; 2220 } 2221 2222 /* Parse the isrow and copy data into outbuf */ 2223 for (i = 0; i < ismax; i++) { 2224 row2proc_i = row2proc[i]; 2225 PetscCall(PetscArrayzero(ctr, size)); 2226 irow_i = irow[i]; 2227 jmax = nrow[i]; 2228 for (j = 0; j < jmax; j++) { /* parse the indices of each IS */ 2229 proc = row2proc_i[j]; 2230 if (proc != rank) { /* copy to the outgoing buf*/ 2231 ctr[proc]++; 2232 *ptr[proc] = irow_i[j]; 2233 ptr[proc]++; 2234 } 2235 } 2236 /* Update the headers for the current IS */ 2237 for (j = 0; j < size; j++) { /* Can Optimise this loop too */ 2238 if ((ctr_j = ctr[j])) { 2239 sbuf1_j = sbuf1[j]; 2240 k = ++sbuf1_j[0]; 2241 sbuf1_j[2 * k] = ctr_j; 2242 sbuf1_j[2 * k - 1] = i; 2243 } 2244 } 2245 } 2246 2247 /* Now post the sends */ 2248 PetscCall(PetscMalloc1(nrqs, &s_waits1)); 2249 for (i = 0; i < nrqs; ++i) { 2250 j = pa[i]; 2251 PetscCallMPI(MPI_Isend(sbuf1[j], w1[j], MPIU_INT, j, tag0, comm, s_waits1 + i)); 2252 } 2253 2254 /* Post Receives to capture the buffer size */ 2255 PetscCall(PetscMalloc1(nrqs, &r_waits2)); 2256 PetscCall(PetscMalloc3(nrqs, &req_source2, nrqs, &rbuf2, nrqs, &rbuf3)); 2257 if (nrqs) rbuf2[0] = tmp + msz; 2258 for (i = 1; i < nrqs; ++i) rbuf2[i] = rbuf2[i - 1] + w1[pa[i - 1]]; 2259 for (i = 0; i < nrqs; ++i) { 2260 j = pa[i]; 2261 PetscCallMPI(MPI_Irecv(rbuf2[i], w1[j], MPIU_INT, j, tag2, comm, r_waits2 + i)); 2262 } 2263 2264 /* 
Send to other procs the buf size they should allocate */ 2265 /* Receive messages*/ 2266 PetscCall(PetscMalloc1(nrqr, &s_waits2)); 2267 PetscCall(PetscMalloc3(nrqr, &sbuf2, nrqr, &req_size, nrqr, &req_source1)); 2268 { 2269 PetscInt *sAi = a->i, *sBi = b->i, id, rstart = C->rmap->rstart; 2270 PetscInt *sbuf2_i; 2271 2272 PetscCallMPI(MPI_Waitall(nrqr, r_waits1, MPI_STATUSES_IGNORE)); 2273 for (i = 0; i < nrqr; ++i) { 2274 req_size[i] = 0; 2275 rbuf1_i = rbuf1[i]; 2276 start = 2 * rbuf1_i[0] + 1; 2277 end = olengths1[i]; 2278 PetscCall(PetscMalloc1(end, &sbuf2[i])); 2279 sbuf2_i = sbuf2[i]; 2280 for (j = start; j < end; j++) { 2281 id = rbuf1_i[j] - rstart; 2282 ncols = sAi[id + 1] - sAi[id] + sBi[id + 1] - sBi[id]; 2283 sbuf2_i[j] = ncols; 2284 req_size[i] += ncols; 2285 } 2286 req_source1[i] = onodes1[i]; 2287 /* form the header */ 2288 sbuf2_i[0] = req_size[i]; 2289 for (j = 1; j < start; j++) sbuf2_i[j] = rbuf1_i[j]; 2290 2291 PetscCallMPI(MPI_Isend(sbuf2_i, end, MPIU_INT, req_source1[i], tag2, comm, s_waits2 + i)); 2292 } 2293 } 2294 2295 PetscCall(PetscFree(onodes1)); 2296 PetscCall(PetscFree(olengths1)); 2297 PetscCall(PetscFree(r_waits1)); 2298 PetscCall(PetscFree4(w1, w2, w3, w4)); 2299 2300 /* Receive messages*/ 2301 PetscCall(PetscMalloc1(nrqs, &r_waits3)); 2302 PetscCallMPI(MPI_Waitall(nrqs, r_waits2, MPI_STATUSES_IGNORE)); 2303 for (i = 0; i < nrqs; ++i) { 2304 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf3[i])); 2305 req_source2[i] = pa[i]; 2306 PetscCallMPI(MPI_Irecv(rbuf3[i], rbuf2[i][0], MPIU_INT, req_source2[i], tag3, comm, r_waits3 + i)); 2307 } 2308 PetscCall(PetscFree(r_waits2)); 2309 2310 /* Wait on sends1 and sends2 */ 2311 PetscCallMPI(MPI_Waitall(nrqs, s_waits1, MPI_STATUSES_IGNORE)); 2312 PetscCallMPI(MPI_Waitall(nrqr, s_waits2, MPI_STATUSES_IGNORE)); 2313 PetscCall(PetscFree(s_waits1)); 2314 PetscCall(PetscFree(s_waits2)); 2315 2316 /* Now allocate sending buffers for a->j, and send them off */ 2317 PetscCall(PetscMalloc1(nrqr, &sbuf_aj)); 
2318 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2319 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aj[0])); 2320 for (i = 1; i < nrqr; i++) sbuf_aj[i] = sbuf_aj[i - 1] + req_size[i - 1]; 2321 2322 PetscCall(PetscMalloc1(nrqr, &s_waits3)); 2323 { 2324 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, lwrite; 2325 PetscInt *cworkA, *cworkB, cstart = C->cmap->rstart, rstart = C->rmap->rstart, *bmap = c->garray; 2326 PetscInt cend = C->cmap->rend; 2327 PetscInt *a_j = a->j, *b_j = b->j, ctmp; 2328 2329 for (i = 0; i < nrqr; i++) { 2330 rbuf1_i = rbuf1[i]; 2331 sbuf_aj_i = sbuf_aj[i]; 2332 ct1 = 2 * rbuf1_i[0] + 1; 2333 ct2 = 0; 2334 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2335 kmax = rbuf1[i][2 * j]; 2336 for (k = 0; k < kmax; k++, ct1++) { 2337 row = rbuf1_i[ct1] - rstart; 2338 nzA = a_i[row + 1] - a_i[row]; 2339 nzB = b_i[row + 1] - b_i[row]; 2340 ncols = nzA + nzB; 2341 cworkA = a_j + a_i[row]; 2342 cworkB = b_j + b_i[row]; 2343 2344 /* load the column indices for this row into cols */ 2345 cols = sbuf_aj_i + ct2; 2346 2347 lwrite = 0; 2348 for (l = 0; l < nzB; l++) { 2349 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2350 } 2351 for (l = 0; l < nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2352 for (l = 0; l < nzB; l++) { 2353 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2354 } 2355 2356 ct2 += ncols; 2357 } 2358 } 2359 PetscCallMPI(MPI_Isend(sbuf_aj_i, req_size[i], MPIU_INT, req_source1[i], tag3, comm, s_waits3 + i)); 2360 } 2361 } 2362 2363 /* create col map: global col of C -> local col of submatrices */ 2364 { 2365 const PetscInt *icol_i; 2366 #if defined(PETSC_USE_CTABLE) 2367 for (i = 0; i < ismax; i++) { 2368 if (!allcolumns[i]) { 2369 PetscCall(PetscTableCreate(ncol[i], C->cmap->N, &cmap[i])); 2370 2371 jmax = ncol[i]; 2372 icol_i = icol[i]; 2373 cmap_i = cmap[i]; 2374 for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(cmap[i], icol_i[j] + 1, j + 1, INSERT_VALUES)); 2375 } else cmap[i] = NULL; 2376 } 2377 #else 2378 
for (i = 0; i < ismax; i++) { 2379 if (!allcolumns[i]) { 2380 PetscCall(PetscCalloc1(C->cmap->N, &cmap[i])); 2381 jmax = ncol[i]; 2382 icol_i = icol[i]; 2383 cmap_i = cmap[i]; 2384 for (j = 0; j < jmax; j++) cmap_i[icol_i[j]] = j + 1; 2385 } else cmap[i] = NULL; 2386 } 2387 #endif 2388 } 2389 2390 /* Create lens which is required for MatCreate... */ 2391 for (i = 0, j = 0; i < ismax; i++) j += nrow[i]; 2392 PetscCall(PetscMalloc1(ismax, &lens)); 2393 2394 if (ismax) PetscCall(PetscCalloc1(j, &lens[0])); 2395 for (i = 1; i < ismax; i++) lens[i] = lens[i - 1] + nrow[i - 1]; 2396 2397 /* Update lens from local data */ 2398 for (i = 0; i < ismax; i++) { 2399 row2proc_i = row2proc[i]; 2400 jmax = nrow[i]; 2401 if (!allcolumns[i]) cmap_i = cmap[i]; 2402 irow_i = irow[i]; 2403 lens_i = lens[i]; 2404 for (j = 0; j < jmax; j++) { 2405 row = irow_i[j]; 2406 proc = row2proc_i[j]; 2407 if (proc == rank) { 2408 PetscCall(MatGetRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2409 if (!allcolumns[i]) { 2410 for (k = 0; k < ncols; k++) { 2411 #if defined(PETSC_USE_CTABLE) 2412 PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol)); 2413 #else 2414 tcol = cmap_i[cols[k]]; 2415 #endif 2416 if (tcol) lens_i[j]++; 2417 } 2418 } else { /* allcolumns */ 2419 lens_i[j] = ncols; 2420 } 2421 PetscCall(MatRestoreRow_MPIAIJ(C, row, &ncols, &cols, NULL)); 2422 } 2423 } 2424 } 2425 2426 /* Create row map: global row of C -> local row of submatrices */ 2427 #if defined(PETSC_USE_CTABLE) 2428 for (i = 0; i < ismax; i++) { 2429 PetscCall(PetscTableCreate(nrow[i], C->rmap->N, &rmap[i])); 2430 irow_i = irow[i]; 2431 jmax = nrow[i]; 2432 for (j = 0; j < jmax; j++) PetscCall(PetscTableAdd(rmap[i], irow_i[j] + 1, j + 1, INSERT_VALUES)); 2433 } 2434 #else 2435 for (i = 0; i < ismax; i++) { 2436 PetscCall(PetscCalloc1(C->rmap->N, &rmap[i])); 2437 rmap_i = rmap[i]; 2438 irow_i = irow[i]; 2439 jmax = nrow[i]; 2440 for (j = 0; j < jmax; j++) rmap_i[irow_i[j]] = j; 2441 } 2442 #endif 2443 2444 /* Update lens from 
offproc data */ 2445 { 2446 PetscInt *rbuf2_i, *rbuf3_i, *sbuf1_i; 2447 2448 PetscCallMPI(MPI_Waitall(nrqs, r_waits3, MPI_STATUSES_IGNORE)); 2449 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2450 sbuf1_i = sbuf1[pa[tmp2]]; 2451 jmax = sbuf1_i[0]; 2452 ct1 = 2 * jmax + 1; 2453 ct2 = 0; 2454 rbuf2_i = rbuf2[tmp2]; 2455 rbuf3_i = rbuf3[tmp2]; 2456 for (j = 1; j <= jmax; j++) { 2457 is_no = sbuf1_i[2 * j - 1]; 2458 max1 = sbuf1_i[2 * j]; 2459 lens_i = lens[is_no]; 2460 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2461 rmap_i = rmap[is_no]; 2462 for (k = 0; k < max1; k++, ct1++) { 2463 #if defined(PETSC_USE_CTABLE) 2464 PetscCall(PetscTableFind(rmap_i, sbuf1_i[ct1] + 1, &row)); 2465 row--; 2466 PetscCheck(row >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "row not found in table"); 2467 #else 2468 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2469 #endif 2470 max2 = rbuf2_i[ct1]; 2471 for (l = 0; l < max2; l++, ct2++) { 2472 if (!allcolumns[is_no]) { 2473 #if defined(PETSC_USE_CTABLE) 2474 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol)); 2475 #else 2476 tcol = cmap_i[rbuf3_i[ct2]]; 2477 #endif 2478 if (tcol) lens_i[row]++; 2479 } else { /* allcolumns */ 2480 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2481 } 2482 } 2483 } 2484 } 2485 } 2486 } 2487 PetscCall(PetscFree(r_waits3)); 2488 PetscCallMPI(MPI_Waitall(nrqr, s_waits3, MPI_STATUSES_IGNORE)); 2489 PetscCall(PetscFree(s_waits3)); 2490 2491 /* Create the submatrices */ 2492 for (i = 0; i < ismax; i++) { 2493 PetscInt rbs, cbs; 2494 2495 PetscCall(ISGetBlockSize(isrow[i], &rbs)); 2496 PetscCall(ISGetBlockSize(iscol[i], &cbs)); 2497 2498 PetscCall(MatCreate(PETSC_COMM_SELF, submats + i)); 2499 PetscCall(MatSetSizes(submats[i], nrow[i], ncol[i], PETSC_DETERMINE, PETSC_DETERMINE)); 2500 2501 PetscCall(MatSetBlockSizes(submats[i], rbs, cbs)); 2502 PetscCall(MatSetType(submats[i], ((PetscObject)A)->type_name)); 2503 PetscCall(MatSeqAIJSetPreallocation(submats[i], 0, lens[i])); 2504 2505 /* create struct Mat_SubSppt and attached it to submat */ 2506 PetscCall(PetscNew(&smat_i)); 2507 subc = (Mat_SeqAIJ *)submats[i]->data; 2508 subc->submatis1 = smat_i; 2509 2510 smat_i->destroy = submats[i]->ops->destroy; 2511 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2512 submats[i]->factortype = C->factortype; 2513 2514 smat_i->id = i; 2515 smat_i->nrqs = nrqs; 2516 smat_i->nrqr = nrqr; 2517 smat_i->rbuf1 = rbuf1; 2518 smat_i->rbuf2 = rbuf2; 2519 smat_i->rbuf3 = rbuf3; 2520 smat_i->sbuf2 = sbuf2; 2521 smat_i->req_source2 = req_source2; 2522 2523 smat_i->sbuf1 = sbuf1; 2524 smat_i->ptr = ptr; 2525 smat_i->tmp = tmp; 2526 smat_i->ctr = ctr; 2527 2528 smat_i->pa = pa; 2529 smat_i->req_size = req_size; 2530 smat_i->req_source1 = req_source1; 2531 2532 smat_i->allcolumns = allcolumns[i]; 2533 smat_i->singleis = PETSC_FALSE; 2534 smat_i->row2proc = row2proc[i]; 2535 smat_i->rmap = rmap[i]; 2536 smat_i->cmap = cmap[i]; 2537 } 2538 2539 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2540 PetscCall(MatCreate(PETSC_COMM_SELF, &submats[0])); 2541 PetscCall(MatSetSizes(submats[0], 0, 0, PETSC_DETERMINE, PETSC_DETERMINE)); 2542 PetscCall(MatSetType(submats[0], MATDUMMY)); 2543 2544 /* create struct Mat_SubSppt 
and attached it to submat */ 2545 PetscCall(PetscNew(&smat_i)); 2546 submats[0]->data = (void *)smat_i; 2547 2548 smat_i->destroy = submats[0]->ops->destroy; 2549 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2550 submats[0]->factortype = C->factortype; 2551 2552 smat_i->id = 0; 2553 smat_i->nrqs = nrqs; 2554 smat_i->nrqr = nrqr; 2555 smat_i->rbuf1 = rbuf1; 2556 smat_i->rbuf2 = rbuf2; 2557 smat_i->rbuf3 = rbuf3; 2558 smat_i->sbuf2 = sbuf2; 2559 smat_i->req_source2 = req_source2; 2560 2561 smat_i->sbuf1 = sbuf1; 2562 smat_i->ptr = ptr; 2563 smat_i->tmp = tmp; 2564 smat_i->ctr = ctr; 2565 2566 smat_i->pa = pa; 2567 smat_i->req_size = req_size; 2568 smat_i->req_source1 = req_source1; 2569 2570 smat_i->allcolumns = PETSC_FALSE; 2571 smat_i->singleis = PETSC_FALSE; 2572 smat_i->row2proc = NULL; 2573 smat_i->rmap = NULL; 2574 smat_i->cmap = NULL; 2575 } 2576 2577 if (ismax) PetscCall(PetscFree(lens[0])); 2578 PetscCall(PetscFree(lens)); 2579 if (sbuf_aj) { 2580 PetscCall(PetscFree(sbuf_aj[0])); 2581 PetscCall(PetscFree(sbuf_aj)); 2582 } 2583 2584 } /* endof scall == MAT_INITIAL_MATRIX */ 2585 2586 /* Post recv matrix values */ 2587 PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag4)); 2588 PetscCall(PetscMalloc1(nrqs, &rbuf4)); 2589 PetscCall(PetscMalloc1(nrqs, &r_waits4)); 2590 for (i = 0; i < nrqs; ++i) { 2591 PetscCall(PetscMalloc1(rbuf2[i][0], &rbuf4[i])); 2592 PetscCallMPI(MPI_Irecv(rbuf4[i], rbuf2[i][0], MPIU_SCALAR, req_source2[i], tag4, comm, r_waits4 + i)); 2593 } 2594 2595 /* Allocate sending buffers for a->a, and send them off */ 2596 PetscCall(PetscMalloc1(nrqr, &sbuf_aa)); 2597 for (i = 0, j = 0; i < nrqr; i++) j += req_size[i]; 2598 if (nrqr) PetscCall(PetscMalloc1(j, &sbuf_aa[0])); 2599 for (i = 1; i < nrqr; i++) sbuf_aa[i] = sbuf_aa[i - 1] + req_size[i - 1]; 2600 2601 PetscCall(PetscMalloc1(nrqr, &s_waits4)); 2602 { 2603 PetscInt nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, lwrite; 2604 PetscInt cstart = C->cmap->rstart, rstart = 
C->rmap->rstart, *bmap = c->garray; 2605 PetscInt cend = C->cmap->rend; 2606 PetscInt *b_j = b->j; 2607 PetscScalar *vworkA, *vworkB, *a_a, *b_a; 2608 2609 PetscCall(MatSeqAIJGetArrayRead(A, (const PetscScalar **)&a_a)); 2610 PetscCall(MatSeqAIJGetArrayRead(c->B, (const PetscScalar **)&b_a)); 2611 for (i = 0; i < nrqr; i++) { 2612 rbuf1_i = rbuf1[i]; 2613 sbuf_aa_i = sbuf_aa[i]; 2614 ct1 = 2 * rbuf1_i[0] + 1; 2615 ct2 = 0; 2616 for (j = 1, max1 = rbuf1_i[0]; j <= max1; j++) { 2617 kmax = rbuf1_i[2 * j]; 2618 for (k = 0; k < kmax; k++, ct1++) { 2619 row = rbuf1_i[ct1] - rstart; 2620 nzA = a_i[row + 1] - a_i[row]; 2621 nzB = b_i[row + 1] - b_i[row]; 2622 ncols = nzA + nzB; 2623 cworkB = b_j + b_i[row]; 2624 vworkA = a_a + a_i[row]; 2625 vworkB = b_a + b_i[row]; 2626 2627 /* load the column values for this row into vals*/ 2628 vals = sbuf_aa_i + ct2; 2629 2630 lwrite = 0; 2631 for (l = 0; l < nzB; l++) { 2632 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2633 } 2634 for (l = 0; l < nzA; l++) vals[lwrite++] = vworkA[l]; 2635 for (l = 0; l < nzB; l++) { 2636 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2637 } 2638 2639 ct2 += ncols; 2640 } 2641 } 2642 PetscCallMPI(MPI_Isend(sbuf_aa_i, req_size[i], MPIU_SCALAR, req_source1[i], tag4, comm, s_waits4 + i)); 2643 } 2644 PetscCall(MatSeqAIJRestoreArrayRead(A, (const PetscScalar **)&a_a)); 2645 PetscCall(MatSeqAIJRestoreArrayRead(c->B, (const PetscScalar **)&b_a)); 2646 } 2647 2648 /* Assemble the matrices */ 2649 /* First assemble the local rows */ 2650 for (i = 0; i < ismax; i++) { 2651 row2proc_i = row2proc[i]; 2652 subc = (Mat_SeqAIJ *)submats[i]->data; 2653 imat_ilen = subc->ilen; 2654 imat_j = subc->j; 2655 imat_i = subc->i; 2656 imat_a = subc->a; 2657 2658 if (!allcolumns[i]) cmap_i = cmap[i]; 2659 rmap_i = rmap[i]; 2660 irow_i = irow[i]; 2661 jmax = nrow[i]; 2662 for (j = 0; j < jmax; j++) { 2663 row = irow_i[j]; 2664 proc = row2proc_i[j]; 2665 if (proc == rank) { 2666 old_row = row; 2667 
#if defined(PETSC_USE_CTABLE) 2668 PetscCall(PetscTableFind(rmap_i, row + 1, &row)); 2669 row--; 2670 #else 2671 row = rmap_i[row]; 2672 #endif 2673 ilen_row = imat_ilen[row]; 2674 PetscCall(MatGetRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2675 mat_i = imat_i[row]; 2676 mat_a = imat_a + mat_i; 2677 mat_j = imat_j + mat_i; 2678 if (!allcolumns[i]) { 2679 for (k = 0; k < ncols; k++) { 2680 #if defined(PETSC_USE_CTABLE) 2681 PetscCall(PetscTableFind(cmap_i, cols[k] + 1, &tcol)); 2682 #else 2683 tcol = cmap_i[cols[k]]; 2684 #endif 2685 if (tcol) { 2686 *mat_j++ = tcol - 1; 2687 *mat_a++ = vals[k]; 2688 ilen_row++; 2689 } 2690 } 2691 } else { /* allcolumns */ 2692 for (k = 0; k < ncols; k++) { 2693 *mat_j++ = cols[k]; /* global col index! */ 2694 *mat_a++ = vals[k]; 2695 ilen_row++; 2696 } 2697 } 2698 PetscCall(MatRestoreRow_MPIAIJ(C, old_row, &ncols, &cols, &vals)); 2699 2700 imat_ilen[row] = ilen_row; 2701 } 2702 } 2703 } 2704 2705 /* Now assemble the off proc rows */ 2706 PetscCallMPI(MPI_Waitall(nrqs, r_waits4, MPI_STATUSES_IGNORE)); 2707 for (tmp2 = 0; tmp2 < nrqs; tmp2++) { 2708 sbuf1_i = sbuf1[pa[tmp2]]; 2709 jmax = sbuf1_i[0]; 2710 ct1 = 2 * jmax + 1; 2711 ct2 = 0; 2712 rbuf2_i = rbuf2[tmp2]; 2713 rbuf3_i = rbuf3[tmp2]; 2714 rbuf4_i = rbuf4[tmp2]; 2715 for (j = 1; j <= jmax; j++) { 2716 is_no = sbuf1_i[2 * j - 1]; 2717 rmap_i = rmap[is_no]; 2718 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2719 subc = (Mat_SeqAIJ *)submats[is_no]->data; 2720 imat_ilen = subc->ilen; 2721 imat_j = subc->j; 2722 imat_i = subc->i; 2723 imat_a = subc->a; 2724 max1 = sbuf1_i[2 * j]; 2725 for (k = 0; k < max1; k++, ct1++) { 2726 row = sbuf1_i[ct1]; 2727 #if defined(PETSC_USE_CTABLE) 2728 PetscCall(PetscTableFind(rmap_i, row + 1, &row)); 2729 row--; 2730 #else 2731 row = rmap_i[row]; 2732 #endif 2733 ilen = imat_ilen[row]; 2734 mat_i = imat_i[row]; 2735 mat_a = imat_a + mat_i; 2736 mat_j = imat_j + mat_i; 2737 max2 = rbuf2_i[ct1]; 2738 if (!allcolumns[is_no]) { 2739 for (l = 0; l < 
max2; l++, ct2++) { 2740 #if defined(PETSC_USE_CTABLE) 2741 PetscCall(PetscTableFind(cmap_i, rbuf3_i[ct2] + 1, &tcol)); 2742 #else 2743 tcol = cmap_i[rbuf3_i[ct2]]; 2744 #endif 2745 if (tcol) { 2746 *mat_j++ = tcol - 1; 2747 *mat_a++ = rbuf4_i[ct2]; 2748 ilen++; 2749 } 2750 } 2751 } else { /* allcolumns */ 2752 for (l = 0; l < max2; l++, ct2++) { 2753 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2754 *mat_a++ = rbuf4_i[ct2]; 2755 ilen++; 2756 } 2757 } 2758 imat_ilen[row] = ilen; 2759 } 2760 } 2761 } 2762 2763 if (!iscsorted) { /* sort column indices of the rows */ 2764 for (i = 0; i < ismax; i++) { 2765 subc = (Mat_SeqAIJ *)submats[i]->data; 2766 imat_j = subc->j; 2767 imat_i = subc->i; 2768 imat_a = subc->a; 2769 imat_ilen = subc->ilen; 2770 2771 if (allcolumns[i]) continue; 2772 jmax = nrow[i]; 2773 for (j = 0; j < jmax; j++) { 2774 mat_i = imat_i[j]; 2775 mat_a = imat_a + mat_i; 2776 mat_j = imat_j + mat_i; 2777 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j], mat_j, mat_a)); 2778 } 2779 } 2780 } 2781 2782 PetscCall(PetscFree(r_waits4)); 2783 PetscCallMPI(MPI_Waitall(nrqr, s_waits4, MPI_STATUSES_IGNORE)); 2784 PetscCall(PetscFree(s_waits4)); 2785 2786 /* Restore the indices */ 2787 for (i = 0; i < ismax; i++) { 2788 PetscCall(ISRestoreIndices(isrow[i], irow + i)); 2789 if (!allcolumns[i]) PetscCall(ISRestoreIndices(iscol[i], icol + i)); 2790 } 2791 2792 for (i = 0; i < ismax; i++) { 2793 PetscCall(MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY)); 2794 PetscCall(MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY)); 2795 } 2796 2797 /* Destroy allocated memory */ 2798 if (sbuf_aa) { 2799 PetscCall(PetscFree(sbuf_aa[0])); 2800 PetscCall(PetscFree(sbuf_aa)); 2801 } 2802 PetscCall(PetscFree5(*(PetscInt ***)&irow, *(PetscInt ***)&icol, nrow, ncol, issorted)); 2803 2804 for (i = 0; i < nrqs; ++i) PetscCall(PetscFree(rbuf4[i])); 2805 PetscCall(PetscFree(rbuf4)); 2806 2807 PetscCall(PetscFree4(row2proc, cmap, rmap, allcolumns)); 2808 
PetscFunctionReturn(0); 2809 } 2810 2811 /* 2812 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2813 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2814 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2815 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2816 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2817 state, and needs to be "assembled" later by compressing B's column space. 2818 2819 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2820 Following this call, C->A & C->B have been created, even if empty. 2821 */ 2822 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C, IS rowemb, IS dcolemb, IS ocolemb, MatStructure pattern, Mat A, Mat B) 2823 { 2824 /* If making this function public, change the error returned in this function away from _PLIB. */ 2825 Mat_MPIAIJ *aij; 2826 Mat_SeqAIJ *Baij; 2827 PetscBool seqaij, Bdisassembled; 2828 PetscInt m, n, *nz, i, j, ngcol, col, rstart, rend, shift, count; 2829 PetscScalar v; 2830 const PetscInt *rowindices, *colindices; 2831 2832 PetscFunctionBegin; 2833 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
*/ 2834 if (A) { 2835 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij)); 2836 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 2837 if (rowemb) { 2838 PetscCall(ISGetLocalSize(rowemb, &m)); 2839 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2840 } else { 2841 PetscCheck(C->rmap->n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2842 } 2843 if (dcolemb) { 2844 PetscCall(ISGetLocalSize(dcolemb, &n)); 2845 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT, n, A->cmap->n); 2846 } else { 2847 PetscCheck(C->cmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2848 } 2849 } 2850 if (B) { 2851 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij)); 2852 PetscCheck(seqaij, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 2853 if (rowemb) { 2854 PetscCall(ISGetLocalSize(rowemb, &m)); 2855 PetscCheck(m == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT, m, A->rmap->n); 2856 } else { 2857 PetscCheck(C->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2858 } 2859 if (ocolemb) { 2860 PetscCall(ISGetLocalSize(ocolemb, &n)); 2861 PetscCheck(n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT, n, B->cmap->n); 2862 } else { 2863 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diag seq matrix is col-incompatible with 
the MPIAIJ matrix"); 2864 } 2865 } 2866 2867 aij = (Mat_MPIAIJ *)C->data; 2868 if (!aij->A) { 2869 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2870 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->A)); 2871 PetscCall(MatSetSizes(aij->A, C->rmap->n, C->cmap->n, C->rmap->n, C->cmap->n)); 2872 PetscCall(MatSetBlockSizesFromMats(aij->A, C, C)); 2873 PetscCall(MatSetType(aij->A, MATSEQAIJ)); 2874 } 2875 if (A) { 2876 PetscCall(MatSetSeqMat_SeqAIJ(aij->A, rowemb, dcolemb, pattern, A)); 2877 } else { 2878 PetscCall(MatSetUp(aij->A)); 2879 } 2880 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2881 /* 2882 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2883 need to "disassemble" B -- convert it to using C's global indices. 2884 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2885 2886 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2887 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2888 2889 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2890 At least avoid calling MatSetValues() and the implied searches? 2891 */ 2892 2893 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2894 #if defined(PETSC_USE_CTABLE) 2895 PetscCall(PetscTableDestroy(&aij->colmap)); 2896 #else 2897 PetscCall(PetscFree(aij->colmap)); 2898 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. 
*/ 2899 #endif 2900 ngcol = 0; 2901 if (aij->lvec) PetscCall(VecGetSize(aij->lvec, &ngcol)); 2902 if (aij->garray) { PetscCall(PetscFree(aij->garray)); } 2903 PetscCall(VecDestroy(&aij->lvec)); 2904 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2905 } 2906 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) PetscCall(MatDestroy(&aij->B)); 2907 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) PetscCall(MatZeroEntries(aij->B)); 2908 } 2909 Bdisassembled = PETSC_FALSE; 2910 if (!aij->B) { 2911 PetscCall(MatCreate(PETSC_COMM_SELF, &aij->B)); 2912 PetscCall(MatSetSizes(aij->B, C->rmap->n, C->cmap->N, C->rmap->n, C->cmap->N)); 2913 PetscCall(MatSetBlockSizesFromMats(aij->B, B, B)); 2914 PetscCall(MatSetType(aij->B, MATSEQAIJ)); 2915 Bdisassembled = PETSC_TRUE; 2916 } 2917 if (B) { 2918 Baij = (Mat_SeqAIJ *)B->data; 2919 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2920 PetscCall(PetscMalloc1(B->rmap->n, &nz)); 2921 for (i = 0; i < B->rmap->n; i++) nz[i] = Baij->i[i + 1] - Baij->i[i]; 2922 PetscCall(MatSeqAIJSetPreallocation(aij->B, 0, nz)); 2923 PetscCall(PetscFree(nz)); 2924 } 2925 2926 PetscCall(PetscLayoutGetRange(C->rmap, &rstart, &rend)); 2927 shift = rend - rstart; 2928 count = 0; 2929 rowindices = NULL; 2930 colindices = NULL; 2931 if (rowemb) PetscCall(ISGetIndices(rowemb, &rowindices)); 2932 if (ocolemb) PetscCall(ISGetIndices(ocolemb, &colindices)); 2933 for (i = 0; i < B->rmap->n; i++) { 2934 PetscInt row; 2935 row = i; 2936 if (rowindices) row = rowindices[i]; 2937 for (j = Baij->i[i]; j < Baij->i[i + 1]; j++) { 2938 col = Baij->j[count]; 2939 if (colindices) col = colindices[col]; 2940 if (Bdisassembled && col >= rstart) col += shift; 2941 v = Baij->a[count]; 2942 PetscCall(MatSetValues(aij->B, 1, &row, 1, &col, &v, INSERT_VALUES)); 2943 ++count; 2944 } 2945 } 2946 /* No assembly for aij->B is necessary. */ 2947 /* FIXME: set aij->B's nonzerostate correctly. 
*/ 2948 } else { 2949 PetscCall(MatSetUp(aij->B)); 2950 } 2951 C->preallocated = PETSC_TRUE; 2952 C->was_assembled = PETSC_FALSE; 2953 C->assembled = PETSC_FALSE; 2954 /* 2955 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2956 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2957 */ 2958 PetscFunctionReturn(0); 2959 } 2960 2961 /* 2962 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2963 */ 2964 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C, Mat *A, Mat *B) 2965 { 2966 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)C->data; 2967 2968 PetscFunctionBegin; 2969 PetscValidPointer(A, 2); 2970 PetscValidPointer(B, 3); 2971 /* FIXME: make sure C is assembled */ 2972 *A = aij->A; 2973 *B = aij->B; 2974 /* Note that we don't incref *A and *B, so be careful! */ 2975 PetscFunctionReturn(0); 2976 } 2977 2978 /* 2979 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 2980 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
 */
/*
  Driver shared by the MPIAIJ (and similar) implementations of MatCreateSubMatricesMPI().
  The actual extraction/insertion work is delegated to the supplied callbacks:
    getsubmats_seq - extracts sequential submatrices (diag and off-diag pieces);
    getlocalmats   - NOTE(review): accepted but not referenced anywhere in this routine;
    setseqmat      - embeds one sequential matrix into a (sequential) submat;
    setseqmats     - embeds a diag/off-diag pair into a parallel submat.
  On output submat[] holds ismax matrices; parallel ones on the IS subcomms, sequential
  ones where the IS comm has a single rank.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[], PetscErrorCode (*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat **), PetscErrorCode (*getlocalmats)(Mat, Mat *, Mat *), PetscErrorCode (*setseqmat)(Mat, IS, IS, MatStructure, Mat), PetscErrorCode (*setseqmats)(Mat, IS, IS, IS, MatStructure, Mat, Mat))
{
  PetscMPIInt size, flag;
  PetscInt    i, ii, cismax, ispar;
  Mat        *A, *B;
  IS         *isrow_p, *iscol_p, *cisrow, *ciscol, *ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel ISs (comm size > 1); each row/col IS pair must share a communicator. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm, ((PetscObject)iscol[i])->comm, &flag));
    PetscCheck(flag == MPI_IDENT, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax, &ispar, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, scall, submat));
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax, &cisrow, cismax, &ciscol));
  PetscCall(PetscMalloc1(cismax, &ciscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i], &(ciscol[ii])));
      /* Now we have to
        (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
        were sorted on each rank, concatenated they might no longer be sorted;
        (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
        indices in the nondecreasing order to the original index positions.
        If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii], PETSC_FALSE, ciscol_p + ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax, &isrow_p, ismax, &iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt        j, issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i], PETSC_FALSE, isrow_p + i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i], &issize));
    PetscCall(ISGetIndices(isrow[i], &indices));
    /* the IS is now sorted, so duplicates must be adjacent */
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i], &indices));
    PetscCall(ISSortPermutation(iscol[i], PETSC_FALSE, iscol_p + i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i], &issize));
    PetscCall(ISGetIndices(iscol[i], &indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j - 1] != indices[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT, i, j - 1, j, indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i], &indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) {
      /* alias (no incref) the row IS for the off-diag extraction; see comment above */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscMalloc1(ismax, submat));

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C, ismax, isrow, iscol, MAT_INITIAL_MATRIX, &A));
  PetscCall((*getsubmats_seq)(C, cismax, cisrow, ciscol, MAT_INITIAL_MATRIX, &B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm, (*submat) + i));
        PetscCall(MatSetSizes((*submat)[i], A[i]->rmap->n, A[i]->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i], MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i], BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i], isrow_p[i], iscol_p[i], ciscol_p[ii], pattern, AA, BB));
          PetscCall(MatAssemblyBegin((*submat)[i], MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i], MAT_FINAL_ASSEMBLY));
        }
        /* A[i] has been embedded into submat[i]; drop the extra reference here.
           B[ii] is released below by MatDestroySubMatrices(). */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol + ii));
      PetscCall(ISDestroy(ciscol_p + ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) PetscCall(MatDestroy(&(*submat)[i]));
      if (isrow_p[i] || iscol_p[i]) {
        /* nontrivial permutation: copy structure then permute values via setseqmat */
        PetscCall(MatDuplicate(A[i], MAT_DO_NOT_COPY_VALUES, (*submat) + i));
        PetscCall(setseqmat((*submat)[i], isrow_p[i], iscol_p[i], pattern, A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A + i));
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to submat */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow, ciscol));
  PetscCall(PetscFree2(isrow_p, iscol_p));
  PetscCall(PetscFree(ciscol_p));
  PetscCall(PetscFree(A)); /* only the array; the matrices were consumed or destroyed above */
  PetscCall(MatDestroySubMatrices(cismax, &B));
  PetscFunctionReturn(0);
}

/* MPIAIJ instantiation of the generic driver above. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C, PetscInt ismax, const IS isrow[], const IS iscol[], MatReuse scall, Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C, ismax, isrow, iscol, scall, submat, MatCreateSubMatrices_MPIAIJ, MatGetSeqMats_MPIAIJ, MatSetSeqMat_SeqAIJ, MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}