#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray; the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
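/* A minimal usage sketch of the COO assembly path set up above (illustrative
   driver code, not part of this file; the index and value arrays are made up
   for the example). Each rank lists its (i,j) entries once up front, then may
   fill values repeatedly:

     PetscInt    coo_i[] = {0, 0, 1};       // global row indices
     PetscInt    coo_j[] = {0, 2, 1};       // global column indices
     PetscScalar v[]     = {1.0, 2.0, 3.0}; // one value per (i,j) pair

     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // dispatches here for MATMPIAIJCUSPARSE
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));       // v[] may live in host or device memory

   Rows owned by another rank, or negative indices (which PETSc uses to drop
   entries), force the extended-COO path above; otherwise the basic path is used. */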
/* Pack the nonzero values destined for other ranks into a send buffer, using the
   permutation perm[] computed during preallocation */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate locally owned COO values into the diagonal (A) and off-diagonal (B) blocks;
   jmap1[i]..jmap1[i+1] delimits the (possibly repeated) input entries that sum into nonzero i */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

/* Accumulate values received from remote ranks; imap2[] maps the i-th remotely
   contributed nonzero to its destination in Aa[]/Ba[] */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host memory, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE`)
.  n - number of local columns (or `PETSC_DECIDE`)
.  M - number of global rows (or `PETSC_DETERMINE`)
.  N - number of global columns (or `PETSC_DETERMINE`)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format, also called compressed row storage, is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(0);
}
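/* A minimal creation sketch (illustrative driver code, not part of this file):
   preallocate 5 nonzeros per row in the diagonal block and 2 per row in the
   off-diagonal block for a matrix with 100 local rows and columns per rank:

     Mat A;
     PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, 100, 100, PETSC_DETERMINE, PETSC_DETERMINE, 5, NULL, 2, NULL, &A));
     // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual ...
     PetscCall(MatDestroy(&A));
*/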
/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

   Level: beginner

.seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/
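/* For example (hypothetical command line), to keep the diagonal block in CSR but
   store the off-diagonal block in hybrid format:

     ./app -mat_type aijcusparse -mat_cusparse_mult_offdiag_storage_format hyb
*/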
// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar               *aa = NULL, *ba = NULL;
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca            = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat           = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ *)aij->A->data;
    jacb = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;
      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;

        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
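/* A minimal consumption sketch (hypothetical kernel, not part of this file): a
   device-assembly kernel can traverse the split CSR blocks returned above. The
   field accesses follow the usage earlier in this file; the kernel itself is
   illustrative only.

     __global__ static void CountDiagNonzeros(PetscSplitCSRDataStructure d_mat, PetscInt nnz[])
     {
       PetscInt r = blockIdx.x * blockDim.x + threadIdx.x;
       if (r < d_mat->diag.n) nnz[r] = d_mat->diag.i[r + 1] - d_mat->diag.i[r]; // CSR row pointers
     }

   Rows owned by this rank span [d_mat->rstart, d_mat->rend); colmap maps a global
   column index to a local off-diagonal column index plus one (zero means the
   column is not present in the off-diagonal block), per the colmap setup above. */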