#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: assign tuple element <0> to tuple element <1>. Used below to
   scatter user-provided COO values through a permutation iterator into a device
   work array. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO assembly state held in the matrix's cuSPARSE spptr:
   the extended-COO maps/permutations/buffers (when in use) and the basic-COO
   permutation (coo_p) and work (coo_pw) arrays. Safe to call when spptr is NULL. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    /* Device arrays allocated by MatSetPreallocationCOO_MPIAIJCUSPARSE() in the extended path */
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* Basic-COO state: thrust device vectors allocated with new; delete NULL is a no-op */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Basic COO value insertion: v[] is in the caller's original COO ordering.
   When a diag/offdiag permutation (coo_p) was built during preallocation, permute
   v into coo_pw on the device and hand the diagonal (first coo_nd) and off-diagonal
   (remaining coo_no) slices to the sequential sub-matrices; otherwise v is already
   in the required order and is passed through directly.
   v may reside in host or device memory (detected with isCudaMem()). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* Host values: stage them on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* coo_pw[k] = d_v[coo_p[k]]: gather values into partitioned (diag-first) order */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Predicate on a (row, col, perm) tuple: true when the column falls inside the
   local diagonal-block column range [_cstart, _cend). Used as the partition test. */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

/* Predicate on a bare column index: true when it lies outside [_cstart, _cend),
   i.e. belongs to the off-diagonal block. */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

/* Shift a global index down by the local range start to obtain a local index. */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

/* Basic COO preallocation: every (coo_i, coo_j) entry is owned locally. Partitions
   the entries on the device into diagonal-block entries followed by off-diagonal
   entries (recording the permutation in cusp->coo_p when a reorder is needed),
   converts indices from global to local, builds garray and the compacted
   off-diagonal column map, and creates/preallocates the SEQAIJCUSPARSE
   sub-matrices b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  /* If the input is already "diagonal entries then off-diagonal entries" no permutation
     is needed; detect that by locating the first off-diag entry and checking whether any
     diag entry follows it */
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* Mixed ordering: stable-partition (i,j,perm) triples so diag entries come first,
       remembering in coo_p where each original entry went */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* Translate the offdiag global column ids in jj[] to compacted local ids via a hash map */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 187 { 188 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data; 189 Mat_MPIAIJCUSPARSE *mpidev; 190 PetscBool coo_basic = PETSC_TRUE; 191 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 192 PetscInt rstart, rend; 193 194 PetscFunctionBegin; 195 PetscCall(PetscFree(mpiaij->garray)); 196 PetscCall(VecDestroy(&mpiaij->lvec)); 197 #if defined(PETSC_USE_CTABLE) 198 PetscCall(PetscHMapIDestroy(&mpiaij->colmap)); 199 #else 200 PetscCall(PetscFree(mpiaij->colmap)); 201 #endif 202 PetscCall(VecScatterDestroy(&mpiaij->Mvctx)); 203 mat->assembled = PETSC_FALSE; 204 mat->was_assembled = PETSC_FALSE; 205 PetscCall(MatResetPreallocationCOO_MPIAIJ(mat)); 206 PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat)); 207 if (coo_i) { 208 PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend)); 209 PetscCall(PetscGetMemType(coo_i, &mtype)); 210 if (PetscMemTypeHost(mtype)) { 211 for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? 
*/ 212 if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) { 213 coo_basic = PETSC_FALSE; 214 break; 215 } 216 } 217 } 218 } 219 /* All ranks must agree on the value of coo_basic */ 220 PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat))); 221 if (coo_basic) { 222 PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 223 } else { 224 PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j)); 225 mat->offloadmask = PETSC_OFFLOAD_CPU; 226 /* creates the GPU memory */ 227 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A)); 228 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B)); 229 mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr); 230 mpidev->use_extended_coo = PETSC_TRUE; 231 232 PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount))); 233 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount))); 234 235 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount))); 236 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount))); 237 238 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount))); 239 PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount))); 240 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount))); 241 242 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount))); 243 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount))); 244 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount))); 245 246 PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount))); 247 PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar))); 248 
PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar))); 249 250 PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 251 PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 252 253 PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 254 PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 255 256 PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 257 PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 258 PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 259 260 PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 261 PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 262 PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 263 264 PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice)); 265 } 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) 270 { 271 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 272 const PetscCount grid_size = gridDim.x * blockDim.x; 273 for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]]; 274 } 275 276 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const 
PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) 277 { 278 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 279 const PetscCount grid_size = gridDim.x * blockDim.x; 280 for (; i < Annz + Bnnz; i += grid_size) { 281 PetscScalar sum = 0.0; 282 if (i < Annz) { 283 for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]]; 284 Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum; 285 } else { 286 i -= Annz; 287 for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]]; 288 Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum; 289 } 290 } 291 } 292 293 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) 294 { 295 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 296 const PetscCount grid_size = gridDim.x * blockDim.x; 297 for (; i < Annz2 + Bnnz2; i += grid_size) { 298 if (i < Annz2) { 299 for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]]; 300 } else { 301 i -= Annz2; 302 for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]]; 303 } 304 } 305 } 306 307 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) 308 { 309 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data); 310 Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr); 311 Mat A = mpiaij->A, B = mpiaij->B; 312 PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2; 313 PetscScalar *Aa, *Ba = NULL; 314 PetscScalar *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d; 315 const PetscScalar *v1 = v; 316 const PetscCount *Ajmap1 = 
mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d; 317 const PetscCount *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d; 318 const PetscCount *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d; 319 const PetscCount *Cperm1 = mpidev->Cperm1_d; 320 PetscMemType memtype; 321 322 PetscFunctionBegin; 323 if (mpidev->use_extended_coo) { 324 PetscMPIInt size; 325 326 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size)); 327 PetscCall(PetscGetMemType(v, &memtype)); 328 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 329 PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar))); 330 PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 331 } 332 333 if (imode == INSERT_VALUES) { 334 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */ 335 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba)); 336 } else { 337 PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */ 338 PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba)); 339 } 340 341 /* Pack entries to be sent to remote */ 342 if (mpiaij->sendlen) { 343 MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend); 344 PetscCallCUDA(cudaPeekAtLastError()); 345 } 346 347 /* Send remote entries to their owner and overlap the communication with local computation */ 348 PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE)); 349 /* Add local entries to A and B */ 350 if (Annz + Bnnz > 0) { 351 MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba); 352 PetscCallCUDA(cudaPeekAtLastError()); 353 } 354 PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, 
MPI_REPLACE)); 355 356 /* Add received remote entries to A and B */ 357 if (Annz2 + Bnnz2 > 0) { 358 MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba); 359 PetscCallCUDA(cudaPeekAtLastError()); 360 } 361 362 if (imode == INSERT_VALUES) { 363 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 364 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba)); 365 } else { 366 PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 367 PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba)); 368 } 369 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 370 } else { 371 PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode)); 372 } 373 mat->offloadmask = PETSC_OFFLOAD_GPU; 374 PetscFunctionReturn(PETSC_SUCCESS); 375 } 376 377 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) 378 { 379 Mat Ad, Ao; 380 const PetscInt *cmap; 381 382 PetscFunctionBegin; 383 PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap)); 384 PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc)); 385 if (glob) { 386 PetscInt cst, i, dn, on, *gidx; 387 388 PetscCall(MatGetLocalSize(Ad, NULL, &dn)); 389 PetscCall(MatGetLocalSize(Ao, NULL, &on)); 390 PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL)); 391 PetscCall(PetscMalloc1(dn + on, &gidx)); 392 for (i = 0; i < dn; i++) gidx[i] = cst + i; 393 for (i = 0; i < on; i++) gidx[i + dn] = cmap[i]; 394 PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob)); 395 } 396 PetscFunctionReturn(PETSC_SUCCESS); 397 } 398 399 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) 400 { 401 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data; 402 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 403 PetscInt i; 404 405 PetscFunctionBegin; 406 PetscCall(PetscLayoutSetUp(B->rmap)); 407 
PetscCall(PetscLayoutSetUp(B->cmap)); 408 if (PetscDefined(USE_DEBUG) && d_nnz) { 409 for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]); 410 } 411 if (PetscDefined(USE_DEBUG) && o_nnz) { 412 for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]); 413 } 414 #if defined(PETSC_USE_CTABLE) 415 PetscCall(PetscHMapIDestroy(&b->colmap)); 416 #else 417 PetscCall(PetscFree(b->colmap)); 418 #endif 419 PetscCall(PetscFree(b->garray)); 420 PetscCall(VecDestroy(&b->lvec)); 421 PetscCall(VecScatterDestroy(&b->Mvctx)); 422 /* Because the B will have been resized we simply destroy it and create a new one each time */ 423 PetscCall(MatDestroy(&b->B)); 424 if (!b->A) { 425 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 426 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 427 } 428 if (!b->B) { 429 PetscMPIInt size; 430 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 431 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 432 PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? 
B->cmap->N : 0)); 433 } 434 PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE)); 435 PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE)); 436 PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 437 PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 438 PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz)); 439 PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz)); 440 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 441 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 442 B->preallocated = PETSC_TRUE; 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) 447 { 448 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 449 450 PetscFunctionBegin; 451 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 452 PetscCall((*a->A->ops->mult)(a->A, xx, yy)); 453 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 454 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); 455 PetscFunctionReturn(PETSC_SUCCESS); 456 } 457 458 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 459 { 460 Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data; 461 462 PetscFunctionBegin; 463 PetscCall(MatZeroEntries(l->A)); 464 PetscCall(MatZeroEntries(l->B)); 465 PetscFunctionReturn(PETSC_SUCCESS); 466 } 467 468 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 469 { 470 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 471 472 PetscFunctionBegin; 473 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 474 PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz)); 475 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 476 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); 477 PetscFunctionReturn(PETSC_SUCCESS); 478 } 479 480 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) 481 { 482 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 483 484 
PetscFunctionBegin; 485 PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); 486 PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy)); 487 PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 488 PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 489 PetscFunctionReturn(PETSC_SUCCESS); 490 } 491 492 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 493 { 494 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 495 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 496 497 PetscFunctionBegin; 498 switch (op) { 499 case MAT_CUSPARSE_MULT_DIAG: 500 cusparseStruct->diagGPUMatFormat = format; 501 break; 502 case MAT_CUSPARSE_MULT_OFFDIAG: 503 cusparseStruct->offdiagGPUMatFormat = format; 504 break; 505 case MAT_CUSPARSE_ALL: 506 cusparseStruct->diagGPUMatFormat = format; 507 cusparseStruct->offdiagGPUMatFormat = format; 508 break; 509 default: 510 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op); 511 } 512 PetscFunctionReturn(PETSC_SUCCESS); 513 } 514 515 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) 516 { 517 MatCUSPARSEStorageFormat format; 518 PetscBool flg; 519 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 520 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 521 522 PetscFunctionBegin; 523 PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options"); 524 if (A->factortype == MAT_FACTOR_NONE) { 525 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 526 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format)); 527 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg)); 528 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format)); 529 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 530 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 531 } 532 PetscOptionsHeadEnd(); 533 PetscFunctionReturn(PETSC_SUCCESS); 534 } 535 536 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) 537 { 538 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data; 539 540 PetscFunctionBegin; 541 PetscCall(MatAssemblyEnd_MPIAIJ(A, mode)); 542 if 
(mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA)); 543 PetscFunctionReturn(PETSC_SUCCESS); 544 } 545 546 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 547 { 548 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 549 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 550 551 PetscFunctionBegin; 552 PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 553 /* Free COO */ 554 PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 555 PetscCallCXX(delete cusparseStruct); 556 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL)); 557 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL)); 558 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 559 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 560 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 561 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL)); 562 PetscCall(MatDestroy_MPIAIJ(A)); 563 PetscFunctionReturn(PETSC_SUCCESS); 564 } 565 566 /* defines MatSetValues_MPICUSPARSE_Hash() */ 567 #define TYPE AIJ 568 #define TYPE_AIJ 569 #define SUB_TYPE_CUSPARSE 570 #include "../src/mat/impls/aij/mpi/mpihashmat.h" 571 #undef TYPE 572 #undef TYPE_AIJ 573 #undef SUB_TYPE_CUSPARSE 574 575 static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A) 576 { 577 Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data; 578 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 579 580 PetscFunctionBegin; 581 PetscCall(MatSetUp_MPI_Hash(A)); 582 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 583 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 584 A->preallocated = PETSC_TRUE; 585 PetscFunctionReturn(PETSC_SUCCESS); 586 } 587 588 PETSC_INTERN PetscErrorCode 
MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat) 589 { 590 Mat_MPIAIJ *a; 591 Mat A; 592 593 PetscFunctionBegin; 594 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 595 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 596 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 597 A = *newmat; 598 A->boundtocpu = PETSC_FALSE; 599 PetscCall(PetscFree(A->defaultvectype)); 600 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 601 602 a = (Mat_MPIAIJ *)A->data; 603 if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE)); 604 if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE)); 605 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 606 607 if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 608 609 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 610 A->ops->mult = MatMult_MPIAIJCUSPARSE; 611 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 612 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 613 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 614 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 615 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 616 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 617 A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE; 618 619 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 620 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 621 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 622 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 623 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 624 
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 625 #if defined(PETSC_HAVE_HYPRE) 626 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 627 #endif 628 PetscFunctionReturn(PETSC_SUCCESS); 629 } 630 631 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 632 { 633 PetscFunctionBegin; 634 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 635 PetscCall(MatCreate_MPIAIJ(A)); 636 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 637 PetscFunctionReturn(PETSC_SUCCESS); 638 } 639 640 /*@ 641 MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 642 (the default parallel PETSc format). This matrix will ultimately pushed down 643 to NVIDIA GPUs and use the CuSPARSE library for calculations. 644 645 Collective 646 647 Input Parameters: 648 + comm - MPI communicator, set to `PETSC_COMM_SELF` 649 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 650 This value should be the same as the local size used in creating the 651 y vector for the matrix-vector product y = Ax. 652 . n - This value should be the same as the local size used in creating the 653 x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have 654 calculated if `N` is given) For square matrices `n` is almost always `m`. 655 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 656 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 657 . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix 658 (same value is used for all local rows) 659 . d_nnz - array containing the number of nonzeros in the various rows of the 660 DIAGONAL portion of the local submatrix (possibly different for each row) 661 or `NULL`, if `d_nz` is used to specify the nonzero structure. 
  The size of this array is equal to the number of local rows, i.e `m`.
  For matrices you plan to factor you must leave room for the diagonal entry and
  put in the entry even if it is zero.
. o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local
  submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
  OFF-DIAGONAL portion of the local submatrix (possibly different for
  each row) or `NULL`, if `o_nz` is used to specify the nonzero
  structure. The size of this array is equal to the number
  of local rows, i.e `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.
.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  /* create an empty matrix and fix its parallel layout before selecting the concrete type */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  /* a single-process communicator yields the sequential type, any larger one the MPI type */
  if (size == 1) {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
  and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
  `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
  for communicators controlling multiple processes. It is recommended that you call both of
  the above preallocation routines for simplicity.

  Options Database Keys:
+ -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
. -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices.
  Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
  MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/