1 #define PETSC_SKIP_SPINLOCK 2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 3 4 #include <petscconf.h> 5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 8 #include <thrust/advance.h> 9 #include <thrust/partition.h> 10 #include <thrust/sort.h> 11 #include <thrust/unique.h> 12 #include <petscsf.h> 13 14 struct VecCUDAEquals { 15 template <typename Tuple> 16 __host__ __device__ void operator()(Tuple t) { 17 thrust::get<1>(t) = thrust::get<0>(t); 18 } 19 }; 20 21 static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) { 22 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data; 23 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 24 25 PetscFunctionBegin; 26 if (!cusparseStruct) PetscFunctionReturn(0); 27 if (cusparseStruct->use_extended_coo) { 28 PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d)); 29 PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d)); 30 PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d)); 31 PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d)); 32 PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d)); 33 PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d)); 34 PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d)); 35 PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d)); 36 PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d)); 37 PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d)); 38 PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d)); 39 PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d)); 40 PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d)); 41 } 42 cusparseStruct->use_extended_coo = PETSC_FALSE; 43 delete cusparseStruct->coo_p; 44 delete cusparseStruct->coo_pw; 45 cusparseStruct->coo_p = NULL; 46 cusparseStruct->coo_pw = NULL; 47 PetscFunctionReturn(0); 48 } 49 50 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) { 51 Mat_MPIAIJ *a = 
(Mat_MPIAIJ *)A->data; 52 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr; 53 PetscInt n = cusp->coo_nd + cusp->coo_no; 54 55 PetscFunctionBegin; 56 if (cusp->coo_p && v) { 57 thrust::device_ptr<const PetscScalar> d_v; 58 THRUSTARRAY *w = NULL; 59 60 if (isCudaMem(v)) { 61 d_v = thrust::device_pointer_cast(v); 62 } else { 63 w = new THRUSTARRAY(n); 64 w->assign(v, v + n); 65 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 66 d_v = w->data(); 67 } 68 69 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin())); 70 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end())); 71 PetscCall(PetscLogGpuTimeBegin()); 72 thrust::for_each(zibit, zieit, VecCUDAEquals()); 73 PetscCall(PetscLogGpuTimeEnd()); 74 delete w; 75 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode)); 76 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode)); 77 } else { 78 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode)); 79 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? 
v + cusp->coo_nd : NULL, imode)); 80 } 81 PetscFunctionReturn(0); 82 } 83 84 template <typename Tuple> 85 struct IsNotOffDiagT { 86 PetscInt _cstart, _cend; 87 88 IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { } 89 __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); } 90 }; 91 92 struct IsOffDiag { 93 PetscInt _cstart, _cend; 94 95 IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { } 96 __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; } 97 }; 98 99 struct GlobToLoc { 100 PetscInt _start; 101 102 GlobToLoc(PetscInt start) : _start(start) { } 103 __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; } 104 }; 105 106 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) { 107 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data; 108 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr; 109 PetscInt N, *jj; 110 size_t noff = 0; 111 THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */ 112 THRUSTINTARRAY d_j(n); 113 ISLocalToGlobalMapping l2g; 114 115 PetscFunctionBegin; 116 PetscCall(MatDestroy(&b->A)); 117 PetscCall(MatDestroy(&b->B)); 118 119 PetscCall(PetscLogCpuToGpu(2. 
* n * sizeof(PetscInt))); 120 d_i.assign(coo_i, coo_i + n); 121 d_j.assign(coo_j, coo_j + n); 122 PetscCall(PetscLogGpuTimeBegin()); 123 auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend)); 124 auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend)); 125 if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 126 cusp->coo_p = new THRUSTINTARRAY(n); 127 cusp->coo_pw = new THRUSTARRAY(n); 128 thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0); 129 auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin())); 130 auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end())); 131 auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend)); 132 firstoffd = mzipp.get_iterator_tuple().get<1>(); 133 } 134 cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd); 135 cusp->coo_no = thrust::distance(firstoffd, d_j.end()); 136 137 /* from global to local */ 138 thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart)); 139 thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart)); 140 PetscCall(PetscLogGpuTimeEnd()); 141 142 /* copy offdiag column indices to map on the CPU */ 143 PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */ 144 PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 145 auto o_j = d_j.begin(); 146 PetscCall(PetscLogGpuTimeBegin()); 147 thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */ 148 thrust::sort(thrust::device, o_j, d_j.end()); 149 auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end 
iter of the unique range */ 150 PetscCall(PetscLogGpuTimeEnd()); 151 noff = thrust::distance(o_j, wit); 152 /* allocate the garray, the columns of B will not be mapped in the multiply setup */ 153 PetscCall(PetscMalloc1(noff, &b->garray)); 154 PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 155 PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt))); 156 PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g)); 157 PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH)); 158 PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj)); 159 PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no); 160 PetscCall(ISLocalToGlobalMappingDestroy(&l2g)); 161 162 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 163 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 164 PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE)); 165 PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A)); 166 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 167 PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff)); 168 PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE)); 169 PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B)); 170 171 /* GPU memory, cusparse specific call handles it internally */ 172 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get())); 173 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj)); 174 PetscCall(PetscFree(jj)); 175 176 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat)); 177 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat)); 178 179 PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 180 
PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 181 PetscCall(MatSetUpMultiply_MPIAIJ(B)); 182 PetscFunctionReturn(0); 183 } 184 185 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) { 186 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data; 187 Mat_MPIAIJCUSPARSE *mpidev; 188 PetscBool coo_basic = PETSC_TRUE; 189 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 190 PetscInt rstart, rend; 191 192 PetscFunctionBegin; 193 PetscCall(PetscFree(mpiaij->garray)); 194 PetscCall(VecDestroy(&mpiaij->lvec)); 195 #if defined(PETSC_USE_CTABLE) 196 PetscCall(PetscTableDestroy(&mpiaij->colmap)); 197 #else 198 PetscCall(PetscFree(mpiaij->colmap)); 199 #endif 200 PetscCall(VecScatterDestroy(&mpiaij->Mvctx)); 201 mat->assembled = PETSC_FALSE; 202 mat->was_assembled = PETSC_FALSE; 203 PetscCall(MatResetPreallocationCOO_MPIAIJ(mat)); 204 PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat)); 205 if (coo_i) { 206 PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend)); 207 PetscCall(PetscGetMemType(coo_i, &mtype)); 208 if (PetscMemTypeHost(mtype)) { 209 for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? 
*/ 210 if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) { 211 coo_basic = PETSC_FALSE; 212 break; 213 } 214 } 215 } 216 } 217 /* All ranks must agree on the value of coo_basic */ 218 PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat))); 219 if (coo_basic) { 220 PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 221 } else { 222 PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j)); 223 mat->offloadmask = PETSC_OFFLOAD_CPU; 224 /* creates the GPU memory */ 225 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A)); 226 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B)); 227 mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr); 228 mpidev->use_extended_coo = PETSC_TRUE; 229 230 PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount))); 231 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount))); 232 233 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount))); 234 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount))); 235 236 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount))); 237 PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount))); 238 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount))); 239 240 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount))); 241 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount))); 242 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount))); 243 244 PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount))); 245 PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar))); 246 
PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar))); 247 248 PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 249 PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 250 251 PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 252 PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 253 254 PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 255 PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 256 PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 257 258 PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 259 PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 260 PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 261 262 PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice)); 263 } 264 PetscFunctionReturn(0); 265 } 266 267 __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) { 268 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 269 const PetscCount grid_size = gridDim.x * blockDim.x; 270 for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]]; 271 } 272 273 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount 
Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) { 274 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 275 const PetscCount grid_size = gridDim.x * blockDim.x; 276 for (; i < Annz + Bnnz; i += grid_size) { 277 PetscScalar sum = 0.0; 278 if (i < Annz) { 279 for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]]; 280 Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum; 281 } else { 282 i -= Annz; 283 for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]]; 284 Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum; 285 } 286 } 287 } 288 289 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) { 290 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 291 const PetscCount grid_size = gridDim.x * blockDim.x; 292 for (; i < Annz2 + Bnnz2; i += grid_size) { 293 if (i < Annz2) { 294 for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]]; 295 } else { 296 i -= Annz2; 297 for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]]; 298 } 299 } 300 } 301 302 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) { 303 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data); 304 Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr); 305 Mat A = mpiaij->A, B = mpiaij->B; 306 PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2; 307 PetscScalar *Aa, *Ba = NULL; 308 PetscScalar *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d; 309 const PetscScalar *v1 = v; 310 const PetscCount *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = 
mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d; 311 const PetscCount *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d; 312 const PetscCount *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d; 313 const PetscCount *Cperm1 = mpidev->Cperm1_d; 314 PetscMemType memtype; 315 316 PetscFunctionBegin; 317 if (mpidev->use_extended_coo) { 318 PetscMPIInt size; 319 320 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size)); 321 PetscCall(PetscGetMemType(v, &memtype)); 322 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 323 PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar))); 324 PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 325 } 326 327 if (imode == INSERT_VALUES) { 328 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */ 329 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba)); 330 } else { 331 PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */ 332 PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba)); 333 } 334 335 /* Pack entries to be sent to remote */ 336 if (mpiaij->sendlen) { 337 MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend); 338 PetscCallCUDA(cudaPeekAtLastError()); 339 } 340 341 /* Send remote entries to their owner and overlap the communication with local computation */ 342 PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE)); 343 /* Add local entries to A and B */ 344 if (Annz + Bnnz > 0) { 345 MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba); 346 PetscCallCUDA(cudaPeekAtLastError()); 347 } 348 PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE)); 349 350 /* Add received 
remote entries to A and B */ 351 if (Annz2 + Bnnz2 > 0) { 352 MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba); 353 PetscCallCUDA(cudaPeekAtLastError()); 354 } 355 356 if (imode == INSERT_VALUES) { 357 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 358 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba)); 359 } else { 360 PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 361 PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba)); 362 } 363 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 364 } else { 365 PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode)); 366 } 367 mat->offloadmask = PETSC_OFFLOAD_GPU; 368 PetscFunctionReturn(0); 369 } 370 371 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) { 372 Mat Ad, Ao; 373 const PetscInt *cmap; 374 375 PetscFunctionBegin; 376 PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap)); 377 PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc)); 378 if (glob) { 379 PetscInt cst, i, dn, on, *gidx; 380 381 PetscCall(MatGetLocalSize(Ad, NULL, &dn)); 382 PetscCall(MatGetLocalSize(Ao, NULL, &on)); 383 PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL)); 384 PetscCall(PetscMalloc1(dn + on, &gidx)); 385 for (i = 0; i < dn; i++) gidx[i] = cst + i; 386 for (i = 0; i < on; i++) gidx[i + dn] = cmap[i]; 387 PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob)); 388 } 389 PetscFunctionReturn(0); 390 } 391 392 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) { 393 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data; 394 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 395 PetscInt i; 396 397 PetscFunctionBegin; 398 PetscCall(PetscLayoutSetUp(B->rmap)); 399 PetscCall(PetscLayoutSetUp(B->cmap)); 400 if (PetscDefined(USE_DEBUG) 
&& d_nnz) { 401 for (i = 0; i < B->rmap->n; i++) { PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]); } 402 } 403 if (PetscDefined(USE_DEBUG) && o_nnz) { 404 for (i = 0; i < B->rmap->n; i++) { PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]); } 405 } 406 #if defined(PETSC_USE_CTABLE) 407 PetscCall(PetscTableDestroy(&b->colmap)); 408 #else 409 PetscCall(PetscFree(b->colmap)); 410 #endif 411 PetscCall(PetscFree(b->garray)); 412 PetscCall(VecDestroy(&b->lvec)); 413 PetscCall(VecScatterDestroy(&b->Mvctx)); 414 /* Because the B will have been resized we simply destroy it and create a new one each time */ 415 PetscCall(MatDestroy(&b->B)); 416 if (!b->A) { 417 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 418 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 419 PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A)); 420 } 421 if (!b->B) { 422 PetscMPIInt size; 423 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 424 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 425 PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? 
B->cmap->N : 0)); 426 PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B)); 427 } 428 PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE)); 429 PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE)); 430 PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 431 PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 432 PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz)); 433 PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz)); 434 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 435 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 436 B->preallocated = PETSC_TRUE; 437 PetscFunctionReturn(0); 438 } 439 440 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) { 441 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 442 443 PetscFunctionBegin; 444 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 445 PetscCall((*a->A->ops->mult)(a->A, xx, yy)); 446 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 447 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); 448 PetscFunctionReturn(0); 449 } 450 451 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) { 452 Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data; 453 454 PetscFunctionBegin; 455 PetscCall(MatZeroEntries(l->A)); 456 PetscCall(MatZeroEntries(l->B)); 457 PetscFunctionReturn(0); 458 } 459 460 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) { 461 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 462 463 PetscFunctionBegin; 464 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 465 PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz)); 466 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 467 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); 468 PetscFunctionReturn(0); 469 } 470 471 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) { 472 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 473 474 
PetscFunctionBegin; 475 PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); 476 PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy)); 477 PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 478 PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 479 PetscFunctionReturn(0); 480 } 481 482 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) { 483 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 484 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 485 486 PetscFunctionBegin; 487 switch (op) { 488 case MAT_CUSPARSE_MULT_DIAG: cusparseStruct->diagGPUMatFormat = format; break; 489 case MAT_CUSPARSE_MULT_OFFDIAG: cusparseStruct->offdiagGPUMatFormat = format; break; 490 case MAT_CUSPARSE_ALL: 491 cusparseStruct->diagGPUMatFormat = format; 492 cusparseStruct->offdiagGPUMatFormat = format; 493 break; 494 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op); 495 } 496 PetscFunctionReturn(0); 497 } 498 499 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) { 500 MatCUSPARSEStorageFormat format; 501 PetscBool flg; 502 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 503 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 504 505 PetscFunctionBegin; 506 PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options"); 507 if (A->factortype == MAT_FACTOR_NONE) { 508 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 509 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format)); 510 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg)); 511 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format)); 512 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 513 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 514 } 515 PetscOptionsHeadEnd(); 516 PetscFunctionReturn(0); 517 } 518 519 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) { 520 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data; 521 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr; 522 PetscObjectState onnz = A->nonzerostate; 523 
524 PetscFunctionBegin; 525 PetscCall(MatAssemblyEnd_MPIAIJ(A, mode)); 526 if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA)); 527 if (onnz != A->nonzerostate && cusp->deviceMat) { 528 PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 529 530 PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 531 PetscCall(PetscNew(&h_mat)); 532 PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 533 PetscCallCUDA(cudaFree(h_mat->colmap)); 534 if (h_mat->allocated_indices) { 535 PetscCallCUDA(cudaFree(h_mat->diag.i)); 536 PetscCallCUDA(cudaFree(h_mat->diag.j)); 537 if (h_mat->offdiag.j) { 538 PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 539 PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 540 } 541 } 542 PetscCallCUDA(cudaFree(d_mat)); 543 PetscCall(PetscFree(h_mat)); 544 cusp->deviceMat = NULL; 545 } 546 PetscFunctionReturn(0); 547 } 548 549 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) { 550 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 551 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 552 553 PetscFunctionBegin; 554 PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 555 if (cusparseStruct->deviceMat) { 556 PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 557 558 PetscCall(PetscInfo(A, "Have device matrix\n")); 559 PetscCall(PetscNew(&h_mat)); 560 PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 561 PetscCallCUDA(cudaFree(h_mat->colmap)); 562 if (h_mat->allocated_indices) { 563 PetscCallCUDA(cudaFree(h_mat->diag.i)); 564 PetscCallCUDA(cudaFree(h_mat->diag.j)); 565 if (h_mat->offdiag.j) { 566 PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 567 PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 568 } 569 } 570 PetscCallCUDA(cudaFree(d_mat)); 571 PetscCall(PetscFree(h_mat)); 572 } 573 /* Free COO */ 574 PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 575 PetscCallCXX(delete cusparseStruct); 576 
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL)); 577 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL)); 578 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 579 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 580 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 581 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL)); 582 PetscCall(MatDestroy_MPIAIJ(A)); 583 PetscFunctionReturn(0); 584 } 585 586 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat) { 587 Mat_MPIAIJ *a; 588 Mat A; 589 590 PetscFunctionBegin; 591 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 592 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 593 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 594 A = *newmat; 595 A->boundtocpu = PETSC_FALSE; 596 PetscCall(PetscFree(A->defaultvectype)); 597 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 598 599 a = (Mat_MPIAIJ *)A->data; 600 if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE)); 601 if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE)); 602 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 603 604 if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 605 606 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 607 A->ops->mult = MatMult_MPIAIJCUSPARSE; 608 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 609 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 610 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 611 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 612 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 613 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 614 615 
PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 616 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 617 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 618 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 619 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 620 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 621 #if defined(PETSC_HAVE_HYPRE) 622 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 623 #endif 624 PetscFunctionReturn(0); 625 } 626 627 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) { 628 PetscFunctionBegin; 629 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 630 PetscCall(MatCreate_MPIAIJ(A)); 631 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 632 PetscFunctionReturn(0); 633 } 634 635 /*@ 636 MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 637 (the default parallel PETSc format). This matrix will ultimately pushed down 638 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 639 assembly performance the user should preallocate the matrix storage by setting 640 the parameter nz (or the array nnz). By setting these parameters accurately, 641 performance during matrix assembly can be increased by more than a factor of 50. 642 643 Collective 644 645 Input Parameters: 646 + comm - MPI communicator, set to PETSC_COMM_SELF 647 . m - number of rows 648 . n - number of columns 649 . 
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = PETSC_DEFAULT and nnz = NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
677 678 Level: intermediate 679 680 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE` 681 @*/ 682 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) { 683 PetscMPIInt size; 684 685 PetscFunctionBegin; 686 PetscCall(MatCreate(comm, A)); 687 PetscCall(MatSetSizes(*A, m, n, M, N)); 688 PetscCallMPI(MPI_Comm_size(comm, &size)); 689 if (size > 1) { 690 PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE)); 691 PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 692 } else { 693 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 694 PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz)); 695 } 696 PetscFunctionReturn(0); 697 } 698 699 /*MC 700 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 701 702 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 703 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 704 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 705 706 This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 707 and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 708 MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 709 for communicators controlling multiple processes. It is recommended that you call both of 710 the above preallocation routines for simplicity. 711 712 Options Database Keys: 713 + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 714 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). 
Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

   Level: beginner

.seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

/*
   Get device (GPU) pointers to a stripped-down CSR view of the matrix, for both Seq and MPI Mat.

   On return *B points to a device-resident PetscSplitCSRDataStructure describing the diagonal
   (and, in parallel, off-diagonal) CSR blocks. The structure is created on first use and cached
   in the type-specific spptr; subsequent calls reuse it. The caller is assumed to write matrix
   values through it on the device (the offload mask is set to GPU at the end).
   Requires MAT_CUSPARSE_CSR storage; factored matrices are not supported (*B is set to NULL).
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) {
  PetscSplitCSRDataStructure d_mat;                            // cached/new device-side struct
  PetscMPIInt                size;                             // communicator size: seq vs MPI path
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; // device CSR indices (32-bit, as stored by cuSPARSE)
  PetscScalar               *aa = NULL, *ba = NULL;            // device CSR values for diag/offdiag blocks
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;        // host-side AIJ data of diag/offdiag blocks
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    // factored matrices carry no plain CSR data to expose
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // Locate the diagonal-block AIJ data (jaca) and its CUSPARSE struct, and snapshot any cached device struct
  if (size == 1) {
    PetscBool isseqaij;

    // a single-process matrix may be either a true SeqAIJ or an MPIAIJ on a 1-rank communicator
    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); // ensure device CSR arrays are current before we take pointers
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca            = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat           = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      // no off-diagonal block in the sequential case
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ *)aij->A->data;
    jacb = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        // B is stored compressed-row on the device; expose full (n+1)-entry row offsets instead,
        // uploading the host row pointers once and caching them on the GPU
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  // Same full-row-offsets treatment for the diagonal block A
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      // colmap[g] = local off-diag column index + 1 for global column g; calloc'd zeros
      // presumably mark columns not present on this rank — TODO confirm against consumer
      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // cuSPARSE stores 32-bit indices; widen B's row/col arrays to 64-bit PetscInt for the device struct
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; // the widened copies above are owned by this struct

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      // 32-bit PetscInt: the cuSPARSE index arrays can be aliased directly
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    // ownership ranges and global column count for mapping global indices on the device
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    { // same 32-bit -> 64-bit widening for the diagonal block A
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt *)ai;
    h_mat->diag.j = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU; // caller is expected to write values on the device
  PetscFunctionReturn(0);
}