#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Functor for thrust::for_each over a zip iterator: assigns tuple element <0>
   to tuple element <1>. Used below to gather permuted COO values into a
   device work array. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO assembly state hanging off the matrix's spptr.

   Safe to call before any GPU structure exists (spptr == NULL returns early).
   The *_d arrays are only allocated on the extended-COO path, hence only
   freed when use_extended_coo is set; the flag is cleared afterwards so a
   repeated call does not double-free them. coo_p/coo_pw belong to the basic
   COO path and are new-allocated thrust vectors, so `delete NULL` is a no-op
   and they are deleted unconditionally and reset to NULL. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

/* Basic COO value insertion: forward values for the first coo_nd entries to
   the diagonal block a->A and the remaining coo_no entries to the off-diagonal
   block a->B. When a permutation (coo_p) was recorded at preallocation time,
   values are first gathered through it into the coo_pw work array on device.
   v may live on host or device (isCudaMem() decides below). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ
*a = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total number of COO entries on this rank */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* Entries were reordered (diagonal first, off-diagonal next) at
       preallocation time; gather v through the stored permutation coo_p
       into the device work array coo_pw before splitting A/B. */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v is on the host: stage it into a temporary device vector */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* zip (permuted source value, destination slot) and copy element-wise */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    /* No permutation (input already diag-first) or NULL v: pass through */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,perm) tuples: true when the column j (tuple element <1>)
   lies inside [_cstart,_cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

/* Predicate on a column index: true when it falls outside the local column
   ownership range [_cstart,_cend), i.e. the entry is off-diagonal. */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

/* Shift a global index down by the ownership range start, yielding the local index. */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

/* Basic COO preallocation: all n entries are locally owned. Partitions the
   (i,j) pairs on device into diagonal-block entries followed by off-diagonal
   entries (remembering the permutation for MatSetValuesCOO), builds garray /
   compacted off-diagonal column ids, and creates the SEQAIJCUSPARSE blocks
   b->A and b->B with their own COO preallocation. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            N, *jj;
  size_t              noff = 0;
  THRUSTINTARRAY      d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY      d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.
* n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  /* Detect whether the input is already "diag entries first, off-diag next":
     if an off-diag entry is followed by another diag entry, we must partition. */
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* Interleaved input: stable-free partition of (i,j,identity-permutation)
       so diag entries come first; coo_p records where each original entry
       went, and coo_pw is the matching value work array used at SetValues time. */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());   /* # off-diagonal entries  */

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* translate the global off-diag column ids in jj[] to compacted local ids via a hash-based l2g map */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* Wipe any previous assembly/COO state before re-preallocating */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    /* Only host indices can be scanned here; device indices keep
       coo_basic == PETSC_TRUE (mtype was initialized to DEVICE above). */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    /* Every entry is locally owned and non-negative: cheap device-only path */
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* Extended path: the CPU implementation builds the jmap/perm/imap tables
       and communication pattern; we then mirror those tables onto the device. */
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

/* Pack the nnz values destined for remote ranks into buf[]: buf[i] = kv[perm[i]].
   1-D grid-stride kernel; launched with enough blocks to cover nnz. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate locally-contributed COO values into the diagonal (A) and
   off-diagonal (B) value arrays. 1-D grid-stride kernel over Annz + Bnnz
   target nonzeros; the jmap arrays give, per nonzero, the range of repeated
   COO entries to sum, and the perm arrays index into kv. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount
Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      /* i-th nonzero of the diagonal block: sum its repeated COO contributions */
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      /* Use a separate index for B: mutating i here would corrupt the
         grid-stride induction variable for any subsequent iteration. */
      const PetscCount j = i - Annz;
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}

/* Accumulate COO values received from remote ranks (kv = recv buffer) into
   the diagonal (A) and off-diagonal (B) value arrays. Remote contributions
   are always added (never inserted); imap2 maps the i-th remote-touched
   nonzero to its position in Aa/Ba. 1-D grid-stride kernel over Annz2 + Bnnz2. */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      /* As above: keep the induction variable i intact for the next stride */
      const PetscCount j = i - Annz2;
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}

/* Set/add COO values for the extended path: stage host values to device if
   needed, pack and ship remote entries through the coo_sf star forest while
   overlapping the local accumulation kernel with the communication, then add
   the received remote contributions. Falls back to the Basic path when the
   extended tables were not built. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add
received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    /* free the device staging copy made above when v[] came from the host */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal SEQAIJCUSPARSE blocks into one local
   matrix A_loc; optionally return in glob the global column indices of the
   merged matrix (owned columns first, then the garray/cmap columns). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;       /* locally owned columns */
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];  /* ghost columns (global ids) */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}

/* Classic (non-COO) preallocation: validate nnz arrays in debug mode, reset
   assembly state, (re)create the SEQAIJCUSPARSE diagonal/off-diagonal blocks,
   and forward the per-row preallocation to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* off-diagonal block is empty (0 columns) in the uniprocessor case */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* y = A*x: overlap the ghost-value scatter with the diagonal-block multiply,
   then add the off-diagonal contribution from the gathered ghost vector. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy)); /* local work overlaps the scatter */
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(0);
}

/* Zero both sequential blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

/* z = y + A*x, with the same communication/computation overlap as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(0);
}

/* y = A^T*x: local transposed multiplies, then reverse-scatter-add the
   off-diagonal contributions back to their owning ranks. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format (CSR/ELL/HYB) for the diagonal
   block, the off-diagonal block, or both. The formats are applied to the
   sequential blocks when they are (re)created. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message fixed to name the operations actually handled above */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_*_storage_format options (only meaningful for
   unfactored matrices) and forward them to MatCUSPARSESetFormat(). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Finish assembly, make the ghost vector a CUDA vector, and drop the cached
   split-CSR device matrix if the nonzero pattern changed during assembly. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   =
A->nonzerostate; /* nonzero-state before assembly, to detect pattern changes */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    /* The cached split-CSR device mirror is stale: copy its header back to
       the host so the device arrays it points to can be freed. */
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Destructor: free the cached split-CSR device matrix (same header-copy dance
   as in MatAssemblyEnd above), the COO device state, and the spptr struct,
   remove composed methods, then defer to the MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix (in place, by duplication, or by copy, per reuse)
   into MPIAIJCUSPARSE: switch the blocks and ghost vector to their CUDA
   types, install the GPU spptr if needed, and wire up the type's ops table
   and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 637 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 638 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 639 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 640 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 641 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 642 #if defined(PETSC_HAVE_HYPRE) 643 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 644 #endif 645 PetscFunctionReturn(0); 646 } 647 648 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 649 { 650 PetscFunctionBegin; 651 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 652 PetscCall(MatCreate_MPIAIJ(A)); 653 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 654 PetscFunctionReturn(0); 655 } 656 657 /*@ 658 MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 659 (the default parallel PETSc format). This matrix will ultimately pushed down 660 to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix 661 assembly performance the user should preallocate the matrix storage by setting 662 the parameter nz (or the array nnz). By setting these parameters accurately, 663 performance during matrix assembly can be increased by more than a factor of 50. 664 665 Collective 666 667 Input Parameters: 668 + comm - MPI communicator, set to `PETSC_COMM_SELF` 669 . m - number of rows 670 . n - number of columns 671 . 
nz - number of nonzeros per row (same for all rows) 672 - nnz - array containing the number of nonzeros in the various rows 673 (possibly different for each row) or NULL 674 675 Output Parameter: 676 . A - the matrix 677 678 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 679 MatXXXXSetPreallocation() paradigm instead of this routine directly. 680 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 681 682 Notes: 683 If nnz is given then nz is ignored 684 685 The AIJ format, also called the 686 compressed row storage), is fully compatible with standard Fortran 77 687 storage. That is, the stored row and column indices can begin at 688 either one (as in Fortran) or zero. See the users' manual for details. 689 690 Specify the preallocated storage with either nz or nnz (not both). 691 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 692 allocation. For large problems you MUST preallocate memory or you 693 will get TERRIBLE performance, see the users' manual chapter on matrices. 694 695 By default, this format uses inodes (identical nodes) when possible, to 696 improve numerical efficiency of matrix-vector products and solves. We 697 search for consecutive rows with the same nonzero structure, thereby 698 reusing matrix information to achieve increased efficiency. 
Level: intermediate

.seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  /* Create the shell, fix the sizes, then pick the sequential or parallel
     CUSPARSE type depending on how many ranks share the communicator */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &nproc));
  if (nproc == 1) {
    /* single rank: sequential type; only the diagonal preallocation applies */
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    /* multiple ranks: parallel type with diagonal and off-diagonal preallocation */
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
. 
-mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

   Level: beginner

.seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

// get GPU pointers to a stripped-down device Mat. Works for both sequential and MPI Mat.
/*
   MatCUSPARSEGetDeviceMatWrite - Returns (creating on first use) a device-resident
   PetscSplitCSRDataStructure that mirrors the CSR data of A for device-side assembly.

   Input Parameter:
.  A - the assembled AIJCUSPARSE matrix (sequential or MPI)

   Output Parameter:
.  B - device pointer to the split-CSR structure, or NULL if A is factored

   Notes:
   On first call the routine gathers raw device pointers to the diagonal (and, in
   parallel, off-diagonal) CSR arrays, builds a host-side mirror struct, and copies
   it to the GPU; subsequent calls reuse the cached device struct. On return the
   matrix offload mask is set to PETSC_OFFLOAD_GPU (the caller is expected to write
   values on the device).
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt size;
  /* raw device pointers into the cuSPARSE CSR arrays: a* = diagonal block, b* = off-diagonal block */
  int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar *aa = NULL, *ba = NULL;
  Mat_SeqAIJ *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
  CsrMatrix *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices are not supported; signal this by returning NULL */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // locate the (local) Mat_SeqAIJ data and the cached device struct for the diagonal block
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      /* single-rank MPIAIJ: the data lives in the diagonal sub-matrix aij->A */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      /* no off-diagonal block in the sequential case */
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ *)aij->A->data;
    jacb = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* B is stored compressed-row on the GPU; lazily build and cache a full
           (uncompressed) row-offset array so device assembly can index any row */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  /* same lazy full-row-offset treatment for the diagonal block A */
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt *colmap;
      PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps global column -> local off-diagonal column + 1 (0 means "not present",
         courtesy of the calloc zero-fill) */
      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        /* cuSPARSE keeps 32-bit row/column indices; stage them to the host, widen
           to 64-bit PetscInt, and upload fresh device copies owned by this struct */
        int *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        /* index arrays were cudaMalloc'ed here — presumably so the destroy path
           knows to free them; destruction is not visible in this file chunk */
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      /* 32-bit PetscInt: the cuSPARSE arrays can be aliased directly, nothing owned */
      h_mat->offdiag.i = (PetscInt *)bi;
      h_mat->offdiag.j = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    /* ownership ranges and global column count for the device struct */
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend = A->cmap->rend;
    h_mat->M = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* same 32->64-bit widening for the diagonal block's index arrays */
      int *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt *)ai;
    h_mat->diag.j = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B = d_mat;
  /* caller intends to write values on the device; mark the GPU copy as current */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}