#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: assigns the first tuple element to the second. Used with zip
   iterators to copy one device sequence onto another element-wise. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Container destroy callback for the device-side COO struct: releases the SF
   (reference was taken when the struct was built), every cudaMalloc'ed array,
   and finally the struct itself. */
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void **data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Set up COO (i,j) preallocation for an MPIAIJCUSPARSE matrix.

   Delegates the analysis to the host implementation MatSetPreallocationCOO_MPIAIJ()
   (copying coo_i/coo_j down from the device first if the caller passed device
   pointers), then mirrors the resulting host COO struct into device memory and
   attaches it to the matrix in the "__PETSc_MatCOOStruct_Device" container so
   MatSetValuesCOO_MPIAIJCUSPARSE() can run entirely on the GPU. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  /* Throw away any existing off-process column mapping/scatter; the COO
     preallocation rebuilds them from scratch */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  /* The host analysis needs host copies of the index arrays */
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Gather COO values destined for other ranks into a contiguous send buffer:
   buf[i] = kv[perm[i]]. Grid-stride loop, any 1D launch configuration. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate locally-owned COO values into the diagonal (A) and off-diagonal (B)
   blocks. Thread i < Annz handles nonzero i of A; threads Annz..Annz+Bnnz-1
   handle B. Each nonzero sums its repeated COO entries via jmap/perm; with
   INSERT_VALUES the previous matrix value is discarded. Grid-stride loop. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

/* Accumulate COO values received from remote ranks into A and B. The imap2
   arrays scatter each received run onto its target nonzero; entries are always
   added (INSERT_VALUES was already applied by MatAddLocalCOOValues). */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

/* Insert/add COO values v[] (host or device memory) using the device COO struct
   built by MatSetPreallocationCOO_MPIAIJCUSPARSE(). Remote entries are packed,
   reduced over the SF, and the communication is overlapped with the local
   accumulation kernel. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  PetscCall(PetscLogGpuTimeBegin());
  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(int)((coo->sendlen + 255) / 256), 256>>>(v1, coo->sendlen, Cperm1, vsend); /* (int) cast: PetscCount is 64-bit, grid dims are not */
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(int)((Annz + Bnnz + 255) / 256), 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(int)((Annz2 + Bnnz2 + 255) / 256), 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the temporary device copy */
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge the diagonal and off-diagonal blocks into one local SeqAIJCUSPARSE
   matrix; optionally return the global column indices of the merged matrix
   (owned columns first, then the off-process columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Preallocate the diagonal/off-diagonal SeqAIJCUSPARSE blocks of B; (re)creates
   the blocks, applies the stored cuSPARSE storage formats, and resets assembly
   state. */
static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) { /* leave hash-based MatSetValues mode: restore the cached ops table */
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A*xx: diagonal-block multiply overlapped with the scatter of off-process
   entries, then the off-diagonal contribution is added. */
static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero both the diagonal and off-diagonal blocks. */
static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = yy + A*xx, with the same communication/computation overlap as MatMult. */
static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T*xx: local transpose multiplies first, then the off-diagonal
   contributions are scattered back (reverse, additive) onto their owners. */
static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Record the requested cuSPARSE storage format for the diagonal and/or
   off-diagonal block; applied when the blocks are (re)built. */
static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message previously listed MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Process -mat_cusparse_* options controlling the GPU storage formats of the
   diagonal and off-diagonal blocks (no effect on factored matrices). */
static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Standard MPIAIJ assembly, then make sure the local work vector lives on the GPU. */
static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release the cuSPARSE-specific context and the composed query functions, then
   fall through to the generic MPIAIJ destroy. */
static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

/* Hash-based MatSetUp: delegate to the generic hash setup and propagate the
   stored cuSPARSE storage formats onto the freshly created blocks. */
static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert an MPIAIJ matrix (in place or into a new matrix) to MPIAIJCUSPARSE:
   retype the blocks and work vector for the GPU, install the CUDA ops table,
   and compose the type-specific query functions. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;
  A->ops->getcurrentmemtype     = MatGetCurrentMemType_MPIAIJ;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor: build a plain MPIAIJ matrix and convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm  - MPI communicator, set to `PETSC_COMM_SELF`
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
          This value should be the same as the local size used in creating the
          y vector for the matrix-vector product y = Ax.
. n     - This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
          calculated if `N` is given) For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
          (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
          DIAGONAL portion of the local submatrix (possibly different for each row)
          or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e `m`.
          For matrices you plan to factor you must leave room for the diagonal entry and
          put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
          submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
          OFF-DIAGONAL portion of the local submatrix (possibly different for
          each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called the
  compressed row storage), is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJCUSPARSE`.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
  and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
  `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
  for communicators controlling multiple processes. It is recommended that you call both of
  the above preallocation routines for simplicity.

  Options Database Keys:
+ -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
. -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
  MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/