1 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 7 #include <thrust/advance.h> 8 #include <thrust/partition.h> 9 #include <thrust/sort.h> 10 #include <thrust/unique.h> 11 #include <petscsf.h> 12 13 struct VecCUDAEquals { 14 template <typename Tuple> 15 __host__ __device__ void operator()(Tuple t) 16 { 17 thrust::get<1>(t) = thrust::get<0>(t); 18 } 19 }; 20 21 static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data) 22 { 23 MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data; 24 25 PetscFunctionBegin; 26 PetscCall(PetscSFDestroy(&coo->sf)); 27 PetscCallCUDA(cudaFree(coo->Ajmap1)); 28 PetscCallCUDA(cudaFree(coo->Aperm1)); 29 PetscCallCUDA(cudaFree(coo->Bjmap1)); 30 PetscCallCUDA(cudaFree(coo->Bperm1)); 31 PetscCallCUDA(cudaFree(coo->Aimap2)); 32 PetscCallCUDA(cudaFree(coo->Ajmap2)); 33 PetscCallCUDA(cudaFree(coo->Aperm2)); 34 PetscCallCUDA(cudaFree(coo->Bimap2)); 35 PetscCallCUDA(cudaFree(coo->Bjmap2)); 36 PetscCallCUDA(cudaFree(coo->Bperm2)); 37 PetscCallCUDA(cudaFree(coo->Cperm1)); 38 PetscCallCUDA(cudaFree(coo->sendbuf)); 39 PetscCallCUDA(cudaFree(coo->recvbuf)); 40 PetscCall(PetscFree(coo)); 41 PetscFunctionReturn(PETSC_SUCCESS); 42 } 43 44 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 45 { 46 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data; 47 PetscBool dev_ij = PETSC_FALSE; 48 PetscMemType mtype = PETSC_MEMTYPE_HOST; 49 PetscInt *i, *j; 50 PetscContainer container_h, container_d; 51 MatCOOStruct_MPIAIJ *coo_h, *coo_d; 52 53 PetscFunctionBegin; 54 PetscCall(PetscFree(mpiaij->garray)); 55 PetscCall(VecDestroy(&mpiaij->lvec)); 56 #if defined(PETSC_USE_CTABLE) 57 PetscCall(PetscHMapIDestroy(&mpiaij->colmap)); 58 #else 59 PetscCall(PetscFree(mpiaij->colmap)); 60 #endif 61 PetscCall(VecScatterDestroy(&mpiaij->Mvctx)); 62 mat->assembled = PETSC_FALSE; 63 mat->was_assembled = PETSC_FALSE; 64 PetscCall(PetscGetMemType(coo_i, &mtype)); 65 if (PetscMemTypeDevice(mtype)) { 66 dev_ij = PETSC_TRUE; 67 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 68 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 69 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 70 } else { 71 i = coo_i; 72 j = coo_j; 73 } 74 75 PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j)); 76 if (dev_ij) PetscCall(PetscFree2(i, j)); 77 mat->offloadmask = PETSC_OFFLOAD_CPU; 78 // Create the GPU memory 79 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A)); 80 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B)); 81 82 // Copy the COO struct to device 83 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 84 PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 85 PetscCall(PetscMalloc1(1, &coo_d)); 86 *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d 87 88 PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d 89 PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount))); 90 PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount))); 91 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount))); 92 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount))); 93 PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount))); 94 PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount))); 95 PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount))); 96 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount))); 97 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount))); 98 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount))); 99 PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount))); 100 PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar))); 101 PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar))); 102 103 PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 104 PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 105 PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 106 PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 107 PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 108 PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 109 PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 110 PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 111 PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 112 PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 113 PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice)); 114 115 // Put the COO struct in a container and then attach that to the matrix 116 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 117 PetscCall(PetscContainerSetPointer(container_d, coo_d)); 118 PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE)); 119 PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 120 PetscCall(PetscContainerDestroy(&container_d)); 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) 125 { 126 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 127 const PetscCount grid_size = gridDim.x * blockDim.x; 128 for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]]; 129 } 130 131 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) 132 { 133 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 134 const PetscCount grid_size = gridDim.x * blockDim.x; 135 for (; i < Annz + Bnnz; i += grid_size) { 136 PetscScalar sum = 0.0; 137 if (i < Annz) { 138 for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]]; 139 Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum; 140 } else { 141 i -= Annz; 142 for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]]; 143 Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum; 144 } 145 } 146 } 147 148 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) 149 { 150 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 151 const PetscCount grid_size = gridDim.x * blockDim.x; 152 for (; i < Annz2 + Bnnz2; i += grid_size) { 153 if (i < Annz2) { 154 for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]]; 155 } else { 156 i -= Annz2; 157 for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]]; 158 } 159 } 160 } 161 162 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) 163 { 164 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data); 165 Mat A = mpiaij->A, B = mpiaij->B; 166 PetscScalar *Aa, *Ba; 167 const PetscScalar *v1 = v; 168 PetscMemType memtype; 169 PetscContainer container; 170 MatCOOStruct_MPIAIJ *coo; 171 172 PetscFunctionBegin; 173 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 174 PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix"); 175 PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 176 177 const auto &Annz = coo->Annz; 178 const auto &Annz2 = coo->Annz2; 179 const auto &Bnnz = coo->Bnnz; 180 const auto &Bnnz2 = coo->Bnnz2; 181 const auto &vsend = coo->sendbuf; 182 const auto &v2 = coo->recvbuf; 183 const auto &Ajmap1 = coo->Ajmap1; 184 const auto &Ajmap2 = coo->Ajmap2; 185 const auto &Aimap2 = coo->Aimap2; 186 const auto &Bjmap1 = coo->Bjmap1; 187 const auto &Bjmap2 = coo->Bjmap2; 188 const auto &Bimap2 = coo->Bimap2; 189 const auto &Aperm1 = coo->Aperm1; 190 const auto &Aperm2 = coo->Aperm2; 191 const auto &Bperm1 = coo->Bperm1; 192 const auto &Bperm2 = coo->Bperm2; 193 const auto &Cperm1 = coo->Cperm1; 194 195 PetscCall(PetscGetMemType(v, &memtype)); 196 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 197 PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar))); 198 PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 199 } 200 201 if (imode == INSERT_VALUES) { 202 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */ 203 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba)); 204 } else { 205 PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */ 206 PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba)); 207 } 208 209 PetscCall(PetscLogGpuTimeBegin()); 210 /* Pack entries to be sent to remote */ 211 if (coo->sendlen) { 212 MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend); 213 PetscCallCUDA(cudaPeekAtLastError()); 214 } 215 216 /* Send remote entries to their owner and overlap the communication with local computation */ 217 PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE)); 218 /* Add local entries to A and B */ 219 if (Annz + Bnnz > 0) { 220 MatAddLocalCOOValues<<<(int)((Annz + Bnnz + 255) / 256), 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba); 221 PetscCallCUDA(cudaPeekAtLastError()); 222 } 223 PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE)); 224 225 /* Add received remote entries to A and B */ 226 if (Annz2 + Bnnz2 > 0) { 227 MatAddRemoteCOOValues<<<(int)((Annz2 + Bnnz2 + 255) / 256), 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba); 228 PetscCallCUDA(cudaPeekAtLastError()); 229 } 230 PetscCall(PetscLogGpuTimeEnd()); 231 232 if (imode == INSERT_VALUES) { 233 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 234 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba)); 235 } else { 236 PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 237 PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba)); 238 } 239 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 240 mat->offloadmask = PETSC_OFFLOAD_GPU; 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) 245 { 246 Mat Ad, Ao; 247 const PetscInt *cmap; 248 249 PetscFunctionBegin; 250 PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap)); 251 PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc)); 252 if (glob) { 253 PetscInt cst, i, dn, on, *gidx; 254 255 PetscCall(MatGetLocalSize(Ad, NULL, &dn)); 256 PetscCall(MatGetLocalSize(Ao, NULL, &on)); 257 PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL)); 258 PetscCall(PetscMalloc1(dn + on, &gidx)); 259 for (i = 0; i < dn; i++) gidx[i] = cst + i; 260 for (i = 0; i < on; i++) gidx[i + dn] = cmap[i]; 261 PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob)); 262 } 263 PetscFunctionReturn(PETSC_SUCCESS); 264 } 265 266 static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) 267 { 268 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data; 269 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 270 PetscInt i; 271 272 PetscFunctionBegin; 273 if (B->hash_active) { 274 B->ops[0] = b->cops; 275 B->hash_active = PETSC_FALSE; 276 } 277 PetscCall(PetscLayoutSetUp(B->rmap)); 278 PetscCall(PetscLayoutSetUp(B->cmap)); 279 if (PetscDefined(USE_DEBUG) && d_nnz) { 280 for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]); 281 } 282 if (PetscDefined(USE_DEBUG) && o_nnz) { 283 for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]); 284 } 285 #if defined(PETSC_USE_CTABLE) 286 PetscCall(PetscHMapIDestroy(&b->colmap)); 287 #else 288 PetscCall(PetscFree(b->colmap)); 289 #endif 290 PetscCall(PetscFree(b->garray)); 291 PetscCall(VecDestroy(&b->lvec)); 292 PetscCall(VecScatterDestroy(&b->Mvctx)); 293 /* Because the B will have been resized we simply destroy it and create a new one each time */ 294 PetscCall(MatDestroy(&b->B)); 295 if (!b->A) { 296 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 297 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 298 } 299 if (!b->B) { 300 PetscMPIInt size; 301 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 302 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 303 PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0)); 304 } 305 PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE)); 306 PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE)); 307 PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 308 PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 309 PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz)); 310 PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz)); 311 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 312 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 313 B->preallocated = PETSC_TRUE; 314 B->was_assembled = PETSC_FALSE; 315 B->assembled = PETSC_FALSE; 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) 320 { 321 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 322 323 PetscFunctionBegin; 324 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 325 PetscCall((*a->A->ops->mult)(a->A, xx, yy)); 326 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 327 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); 328 PetscFunctionReturn(PETSC_SUCCESS); 329 } 330 331 static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 332 { 333 Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data; 334 335 PetscFunctionBegin; 336 PetscCall(MatZeroEntries(l->A)); 337 PetscCall(MatZeroEntries(l->B)); 338 PetscFunctionReturn(PETSC_SUCCESS); 339 } 340 341 static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 342 { 343 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 344 345 PetscFunctionBegin; 346 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 347 PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz)); 348 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 349 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); 350 PetscFunctionReturn(PETSC_SUCCESS); 351 } 352 353 static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) 354 { 355 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 356 357 PetscFunctionBegin; 358 PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); 359 PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy)); 360 PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 361 PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 362 PetscFunctionReturn(PETSC_SUCCESS); 363 } 364 365 static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 366 { 367 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 368 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 369 370 PetscFunctionBegin; 371 switch (op) { 372 case MAT_CUSPARSE_MULT_DIAG: 373 cusparseStruct->diagGPUMatFormat = format; 374 break; 375 case MAT_CUSPARSE_MULT_OFFDIAG: 376 cusparseStruct->offdiagGPUMatFormat = format; 377 break; 378 case MAT_CUSPARSE_ALL: 379 cusparseStruct->diagGPUMatFormat = format; 380 cusparseStruct->offdiagGPUMatFormat = format; 381 break; 382 default: 383 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op); 384 } 385 PetscFunctionReturn(PETSC_SUCCESS); 386 } 387 388 static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) 389 { 390 MatCUSPARSEStorageFormat format; 391 PetscBool flg; 392 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 393 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 394 395 PetscFunctionBegin; 396 PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options"); 397 if (A->factortype == MAT_FACTOR_NONE) { 398 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 399 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format)); 400 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg)); 401 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format)); 402 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 403 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 404 } 405 PetscOptionsHeadEnd(); 406 PetscFunctionReturn(PETSC_SUCCESS); 407 } 408 409 static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) 410 { 411 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data; 412 413 PetscFunctionBegin; 414 PetscCall(MatAssemblyEnd_MPIAIJ(A, mode)); 415 if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA)); 416 PetscFunctionReturn(PETSC_SUCCESS); 417 } 418 419 static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 420 { 421 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 422 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 423 424 PetscFunctionBegin; 425 PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 426 PetscCallCXX(delete cusparseStruct); 427 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL)); 428 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL)); 429 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 430 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 431 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 432 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL)); 433 PetscCall(MatDestroy_MPIAIJ(A)); 434 PetscFunctionReturn(PETSC_SUCCESS); 435 } 436 437 /* defines MatSetValues_MPICUSPARSE_Hash() */ 438 #define TYPE AIJ 439 #define TYPE_AIJ 440 #define SUB_TYPE_CUSPARSE 441 #include "../src/mat/impls/aij/mpi/mpihashmat.h" 442 #undef TYPE 443 #undef TYPE_AIJ 444 #undef SUB_TYPE_CUSPARSE 445 446 static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A) 447 { 448 Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data; 449 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 450 451 PetscFunctionBegin; 452 PetscCall(MatSetUp_MPI_Hash(A)); 453 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 454 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 455 A->preallocated = PETSC_TRUE; 456 PetscFunctionReturn(PETSC_SUCCESS); 457 } 458 459 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat) 460 { 461 Mat_MPIAIJ *a; 462 Mat A; 463 464 PetscFunctionBegin; 465 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 466 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 467 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 468 A = *newmat; 469 A->boundtocpu = PETSC_FALSE; 470 PetscCall(PetscFree(A->defaultvectype)); 471 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 472 473 a = (Mat_MPIAIJ *)A->data; 474 if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE)); 475 if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE)); 476 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 477 478 if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 479 480 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 481 A->ops->mult = MatMult_MPIAIJCUSPARSE; 482 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 483 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 484 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 485 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 486 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 487 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 488 A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE; 489 490 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 491 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 492 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 493 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 494 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 495 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 496 #if defined(PETSC_HAVE_HYPRE) 497 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 498 #endif 499 PetscFunctionReturn(PETSC_SUCCESS); 500 } 501 502 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 503 { 504 PetscFunctionBegin; 505 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 506 PetscCall(MatCreate_MPIAIJ(A)); 507 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 508 PetscFunctionReturn(PETSC_SUCCESS); 509 } 510 511 /*@ 512 MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 513 (the default parallel PETSc format). This matrix will ultimately pushed down 514 to NVIDIA GPUs and use the CuSPARSE library for calculations. 515 516 Collective 517 518 Input Parameters: 519 + comm - MPI communicator, set to `PETSC_COMM_SELF` 520 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 521 This value should be the same as the local size used in creating the 522 y vector for the matrix-vector product y = Ax. 523 . n - This value should be the same as the local size used in creating the 524 x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have 525 calculated if `N` is given) For square matrices `n` is almost always `m`. 526 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 527 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 528 . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix 529 (same value is used for all local rows) 530 . d_nnz - array containing the number of nonzeros in the various rows of the 531 DIAGONAL portion of the local submatrix (possibly different for each row) 532 or `NULL`, if `d_nz` is used to specify the nonzero structure. 533 The size of this array is equal to the number of local rows, i.e `m`. 534 For matrices you plan to factor you must leave room for the diagonal entry and 535 put in the entry even if it is zero. 536 . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local 537 submatrix (same value is used for all local rows). 538 - o_nnz - array containing the number of nonzeros in the various rows of the 539 OFF-DIAGONAL portion of the local submatrix (possibly different for 540 each row) or `NULL`, if `o_nz` is used to specify the nonzero 541 structure. The size of this array is equal to the number 542 of local rows, i.e `m`. 543 544 Output Parameter: 545 . A - the matrix 546 547 Level: intermediate 548 549 Notes: 550 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 551 MatXXXXSetPreallocation() paradigm instead of this routine directly. 552 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 553 554 The AIJ format, also called the 555 compressed row storage), is fully compatible with standard Fortran 556 storage. That is, the stored row and column indices can begin at 557 either one (as in Fortran) or zero. 558 559 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE` 560 @*/ 561 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) 562 { 563 PetscMPIInt size; 564 565 PetscFunctionBegin; 566 PetscCall(MatCreate(comm, A)); 567 PetscCall(MatSetSizes(*A, m, n, M, N)); 568 PetscCallMPI(MPI_Comm_size(comm, &size)); 569 if (size > 1) { 570 PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE)); 571 PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 572 } else { 573 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 574 PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz)); 575 } 576 PetscFunctionReturn(PETSC_SUCCESS); 577 } 578 579 /*MC 580 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJCUSPARSE`. 581 582 A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either 583 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 584 All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 585 586 This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator, 587 and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators, 588 `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported 589 for communicators controlling multiple processes. It is recommended that you call both of 590 the above preallocation routines for simplicity. 591 592 Options Database Keys: 593 + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` 594 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid). 595 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid). 596 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid). 597 598 Level: beginner 599 600 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 601 M*/ 602 603 /*MC 604 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJCUSPARSE`. 605 606 Level: beginner 607 608 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE` 609 M*/ 610