#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

// Functor copying the first element of a thrust tuple into the second; usable on host and device
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

// Destroy callback for the device-side COO struct attached to the matrix:
// drops the SF reference and frees every cudaMalloc'ed array, then the struct itself
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Set up COO assembly: run the host-side preallocation (which builds a host COO struct),
// then mirror that struct on the device, copying all index/permutation arrays to GPU memory.
// coo_i/coo_j may be host or device pointers; device input is staged through a host copy
// because MatSetPreallocationCOO_MPIAIJ works on host arrays.
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // Wipe any existing assembly state; the COO path rebuilds it from scratch
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    // Indices live on the device: copy them to temporary host buffers
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  // Allocate device mirrors of the host index/permutation arrays
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Gather kernel: buf[i] = kv[perm[i]] for i in [0, nnz); grid-stride loop, any 1D launch config
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

// Accumulate locally-owned COO values into the diagonal (A) and off-diagonal (B) blocks.
// Thread i < Annz handles A entry i; threads [Annz, Annz+Bnnz) handle B. Ajmap1/Bjmap1 give, per
// nonzero, the range of kv positions (via Aperm1/Bperm1) that sum into it. INSERT_VALUES overwrites,
// otherwise the sum is added to the existing value.
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

// Accumulate COO values received from remote ranks (kv = recvbuf) into A and B.
// Aimap2/Bimap2 map the i-th remote-touched nonzero to its position in Aa/Ba; always adds
// (the INSERT_VALUES zeroing already happened in MatAddLocalCOOValues).
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

// Insert/add the user's COO values v[] (host or device) into the matrix on the GPU,
// overlapping the send of remote entries (via PetscSF) with the local accumulation kernel.
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Merge the diagonal and off-diagonal blocks into one local SeqAIJCUSPARSE matrix; optionally
// return the global column indices of the merged matrix (owned columns first, then garray entries)
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatMPIAIJSetPreallocation implementation for MPIAIJCUSPARSE: (re)creates the diagonal (A) and
// off-diagonal (B) blocks as SEQAIJCUSPARSE matrices, preallocates them, and applies the stored
// cuSPARSE storage formats
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) {
    // Leaving hash-based insertion mode: restore the cached ops table
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// y = A*x: overlap the halo scatter of x with the diagonal-block multiply, then add B*lvec
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Zero both the diagonal and off-diagonal blocks
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// z = y + A*x, with the same communication/computation overlap as MatMult
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// y = A^T*x: B^T*x goes into lvec and is scattered back (reverse, additive) into y
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Record the requested cuSPARSE storage format for the diagonal and/or off-diagonal block
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Process -mat_cusparse_* options controlling the GPU storage format of the blocks
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Standard MPIAIJ assembly end, then make sure the halo vector lives on the GPU
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Free the cuSPARSE-specific struct, detach the composed methods, then run the MPIAIJ destructor
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

// Hash-based MatSetUp, followed by applying the stored cuSPARSE formats to the new blocks
static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Convert an MPIAIJ matrix to MPIAIJCUSPARSE: switch the blocks and halo vector to CUDA types,
// install the CUSPARSE-specific ops, and compose the type-specific methods
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Type constructor: build an MPIAIJ matrix, then convert it in place to MPIAIJCUSPARSE
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - This value should be the same as the local size used in creating the
       x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
       calculated if `N` is given) For square matrices `n` is almost always `m`.
.  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
.  d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
-  o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e `m`.

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/