#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: copies the first tuple element into the second (element-wise vector assignment) */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Destroy a device-resident MatCOOStruct_MPIAIJ: release the SF and every cudaMalloc'ed array, then the struct itself */
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Set up COO assembly: run the host-side MPIAIJ preallocation (copying coo_i/coo_j to host first if the
   caller gave device pointers), then mirror the resulting host COO struct onto the device so that
   MatSetValuesCOO can run entirely on the GPU. The device copy is attached to the matrix in a container
   under "__PETSc_MatCOOStruct_Device". */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  /* Wipe state derived from a previous nonzero pattern; it will be rebuilt by the preallocation below */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* MatSetPreallocationCOO_MPIAIJ works on host indices, so stage device input through host buffers */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Gather kernel: buf[i] = kv[perm[i]] for i in [0,nnz). Grid-stride loop, so any 1D launch size is valid. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate locally-owned COO values into the diagonal (A) and off-diagonal (B) blocks.
   Thread i < Annz handles nonzero i of A; threads [Annz, Annz+Bnnz) handle B. Each nonzero sums the
   (possibly repeated) input entries selected by its jmap/perm range; INSERT_VALUES overwrites,
   otherwise the sum is added to the existing value. Grid-stride loop. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

/* Accumulate values received from remote ranks into A and B. Remote contributions are always added
   (the local pass has already applied INSERT semantics); imap2 maps the i-th remote-touched nonzero
   to its slot in Aa/Ba. Grid-stride loop. */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

/* Insert/add the COO values v[] (host or device memory) into the matrix on the GPU, using the device
   COO struct built by MatSetPreallocationCOO_MPIAIJCUSPARSE. Off-process entries are packed, sent via
   the struct's SF, and the communication is overlapped with the local accumulation kernel. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge the diagonal and off-diagonal blocks into one sequential matrix; optionally return the
   global column indices (owned columns first, then the off-process column map) as an IS. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Preallocate the diagonal (A) and off-diagonal (B) SEQAIJCUSPARSE blocks, creating them if needed and
   propagating the stored cuSPARSE storage formats. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* On one rank there is no off-diagonal block, so give it zero columns */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A*x: overlap the ghost-value scatter with the diagonal-block multiply, then add the off-diagonal part */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero both sequential blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = y + A*x: same overlap pattern as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T*x: transpose-multiply both blocks, then reverse-scatter (add) the off-diagonal contribution */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Record the requested cuSPARSE storage format for the diagonal and/or off-diagonal block */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* bug fix: the message previously listed MAT_CUSPARSE_MULT_DIAG twice and the nonexistent
       MAT_CUSPARSE_MULT_ALL; list the operations the switch actually handles */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Process -mat_cusparse_* options controlling the storage formats of the two blocks */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Standard MPIAIJ assembly, then make sure the ghost vector lives on the GPU */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free the cuSPARSE-specific context, detach composed methods, then defer to the MPIAIJ destructor */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

/* Hash-based setup (no preallocation), then propagate the stored storage formats to the new blocks */
static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE in place (or into a duplicate/reused matrix),
   installing the CUDA-aware operations and composed methods */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Type constructor: build an MPIAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm  - MPI communicator, set to `PETSC_COMM_SELF`
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
          This value should be the same as the local size used in creating the
          y vector for the matrix-vector product y = Ax.
. n     - This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
          calculated if `N` is given) For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
          (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
          DIAGONAL portion of the local submatrix (possibly different for each row)
          or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e `m`.
          For matrices you plan to factor you must leave room for the diagonal entry and
          put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
          submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
          OFF-DIAGONAL portion of the local submatrix (possibly different for
          each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
  and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
  `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
  for communicators controlling multiple processes. It is recommended that you call both of
  the above preallocation routines for simplicity.

  Options Database Keys:
+ -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
. -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
  MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/