#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Functor for thrust::for_each over a zip iterator: copy tuple element 0 into tuple element 1 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO assembly state attached to the matrix.

   Fix over the original: after cudaFree() the device pointers are nulled, so calling
   this routine twice (e.g. reset followed by destroy) cannot double-free. cudaFree(NULL)
   is a no-op, so the extended-COO branch stays safe on a subsequent call. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
    /* Null the freed pointers to prevent dangling pointers and double frees */
    cusparseStruct->Ajmap1_d  = NULL;
    cusparseStruct->Aperm1_d  = NULL;
    cusparseStruct->Bjmap1_d  = NULL;
    cusparseStruct->Bperm1_d  = NULL;
    cusparseStruct->Aimap2_d  = NULL;
    cusparseStruct->Ajmap2_d  = NULL;
    cusparseStruct->Aperm2_d  = NULL;
    cusparseStruct->Bimap2_d  = NULL;
    cusparseStruct->Bjmap2_d  = NULL;
    cusparseStruct->Bperm2_d  = NULL;
    cusparseStruct->Cperm1_d  = NULL;
    cusparseStruct->sendbuf_d = NULL;
    cusparseStruct->recvbuf_d = NULL;
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

/* Insert/add COO values in the "basic" path (all entries locally owned).
   v may live on host or device; imode is INSERT_VALUES or ADD_VALUES.
   When a permutation coo_p exists (entries were reordered diag-first at
   preallocation time), v is permuted into coo_pw before being handed to
   the sequential diag (a->A) and offdiag (a->B) blocks. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* gather v[coo_p[k]] into coo_pw[k] on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    /* no permutation needed: diag entries come first, offdiag entries follow */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,perm) tuples: true when the column j lies inside [cstart,cend), i.e. in the diagonal block */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

/* Predicate on a column index: true when it falls outside [cstart,cend), i.e. in the off-diagonal block */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

/* Shift a global index to a local one by subtracting the ownership start */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

/* "Basic" COO preallocation: all (coo_i,coo_j) entries are locally owned.
   Partitions the entries so the diagonal-block entries precede the off-diagonal
   ones (recording the permutation in cusp->coo_p when needed), converts indices
   to local numbering, builds the compacted off-diagonal column map b->garray,
   and preallocates the sequential diag/offdiag blocks on the GPU. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* entries are interleaved: stable-partition them diag-first and remember the permutation */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* Drop any state left over from a previous preallocation/assembly */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));

  /* Decide between the fast "basic" path (only valid when every entry is
     nonnegative and row-owned by this rank) and the general extended path.
     The scan is only possible when the indices are host-accessible. */
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));

  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* Extended path: let the host MPIAIJ implementation build the COO maps,
       then mirror all of them onto the device. */
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* Device copies of the host-side COO maps built by MatSetPreallocationCOO_MPIAIJ */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

/* Pack remote-bound COO values: buf[i] = kv[perm[i]]. Grid-stride loop; any launch size works */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) {
  PetscCount       tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount stride = gridDim.x * blockDim.x;
  for (; tid < nnz; tid += stride) buf[tid] = kv[perm[tid]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount
Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) {
  /* Accumulate locally-contributed COO values into the diag (Aa) and offdiag (Ba) value arrays.
     Entry i < Annz targets Aa[i]; otherwise it targets Ba[i - Annz]. Ajmap1/Bjmap1 delimit, and
     Aperm1/Bperm1 gather, the kv[] contributions for each nonzero.
     Fix: use a local index for the B half instead of mutating the grid-stride loop variable
     (the original `i -= Annz` corrupted subsequent iterations of any thread that loops more
     than once; it only worked because the launch oversized the grid). */
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* do not modify i: it drives the grid-stride loop */
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}

/* Add values received from remote ranks (already reduced into kv[]) into Aa/Ba.
   Aimap2/Bimap2 map the compacted remote-entry index to the target nonzero.
   Same loop-variable fix as in MatAddLocalCOOValues. */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2;
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}

/* Set COO values: extended path packs & ships remote entries via the coo_sf star forest,
   overlapping communication with the local-entry kernel; basic path forwards to the
   Basic routine. v may be a host or device pointer (copied to device when host).
   Fix: removed an unused `size`/MPI_Comm_size query (its result was never read). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received
remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    /* release the device staging copy made for host-resident v[] */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Merge the diag and offdiag blocks into one sequential matrix A_loc; optionally return
   in glob the global column indices corresponding to A_loc's local columns
   (owned columns first, then the compacted off-process columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) {
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;       /* owned columns are contiguous from cst */
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];  /* off-process columns come from the column map */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}

/* Classic (non-COO) preallocation: validates nnz arrays in debug builds, tears down
   stale scatter/colmap state, and (re)creates the sequential CUSPARSE diag/offdiag blocks */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) {
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* sequential runs have no off-diagonal block, hence zero columns */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* y = A*x: overlap the ghost-value scatter with the diagonal-block multiply */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(0);
}

/* Zero both sequential blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

/* z = y + A*x, with the same scatter/compute overlap as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(0);
}

/* y = A^T*x: offdiag contribution goes into lvec and is scattered back in reverse with ADD */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx,
a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diag and/or offdiag block.
   Fix: the error message used to list MAT_CUSPARSE_MULT_DIAG twice and the nonexistent
   MAT_CUSPARSE_MULT_ALL; it now names the operations this switch actually handles. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG: cusparseStruct->diagGPUMatFormat = format; break;
  case MAT_CUSPARSE_MULT_OFFDIAG: cusparseStruct->offdiagGPUMatFormat = format; break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options; only meaningful on unfactored matrices */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* After host-side assembly finishes: make lvec a CUDA vector, and if the nonzero
   pattern changed, discard the cached stripped-down device matrix (its index arrays
   no longer match) by copying the device struct back to host and freeing its members */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    /* bring the device-resident struct to host so its internal device pointers can be freed */
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 548 549 PetscFunctionBegin; 550 PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 551 if (cusparseStruct->deviceMat) { 552 PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 553 554 PetscCall(PetscInfo(A, "Have device matrix\n")); 555 PetscCall(PetscNew(&h_mat)); 556 PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 557 PetscCallCUDA(cudaFree(h_mat->colmap)); 558 if (h_mat->allocated_indices) { 559 PetscCallCUDA(cudaFree(h_mat->diag.i)); 560 PetscCallCUDA(cudaFree(h_mat->diag.j)); 561 if (h_mat->offdiag.j) { 562 PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 563 PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 564 } 565 } 566 PetscCallCUDA(cudaFree(d_mat)); 567 PetscCall(PetscFree(h_mat)); 568 } 569 /* Free COO */ 570 PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 571 PetscCallCXX(delete cusparseStruct); 572 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL)); 573 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL)); 574 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 575 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 576 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 577 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL)); 578 PetscCall(MatDestroy_MPIAIJ(A)); 579 PetscFunctionReturn(0); 580 } 581 582 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat) { 583 Mat_MPIAIJ *a; 584 Mat A; 585 586 PetscFunctionBegin; 587 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 588 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 589 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, 
SAME_NONZERO_PATTERN)); 590 A = *newmat; 591 A->boundtocpu = PETSC_FALSE; 592 PetscCall(PetscFree(A->defaultvectype)); 593 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 594 595 a = (Mat_MPIAIJ *)A->data; 596 if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE)); 597 if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE)); 598 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 599 600 if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 601 602 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 603 A->ops->mult = MatMult_MPIAIJCUSPARSE; 604 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 605 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 606 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 607 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 608 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 609 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 610 611 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 612 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 613 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 614 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 615 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 616 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 617 #if defined(PETSC_HAVE_HYPRE) 618 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 619 #endif 620 PetscFunctionReturn(0); 621 } 622 623 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) { 624 PetscFunctionBegin; 625 
PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 626 PetscCall(MatCreate_MPIAIJ(A)); 627 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 628 PetscFunctionReturn(0); 629 } 630 631 /*@ 632 MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 633 (the default parallel PETSc format). This matrix will ultimately pushed down 634 to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix 635 assembly performance the user should preallocate the matrix storage by setting 636 the parameter nz (or the array nnz). By setting these parameters accurately, 637 performance during matrix assembly can be increased by more than a factor of 50. 638 639 Collective 640 641 Input Parameters: 642 + comm - MPI communicator, set to `PETSC_COMM_SELF` 643 . m - number of rows 644 . n - number of columns 645 . nz - number of nonzeros per row (same for all rows) 646 - nnz - array containing the number of nonzeros in the various rows 647 (possibly different for each row) or NULL 648 649 Output Parameter: 650 . A - the matrix 651 652 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 653 MatXXXXSetPreallocation() paradigm instead of this routine directly. 654 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 655 656 Notes: 657 If nnz is given then nz is ignored 658 659 The AIJ format, also called the 660 compressed row storage), is fully compatible with standard Fortran 77 661 storage. That is, the stored row and column indices can begin at 662 either one (as in Fortran) or zero. See the users' manual for details. 663 664 Specify the preallocated storage with either nz or nnz (not both). 665 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 666 allocation. For large problems you MUST preallocate memory or you 667 will get TERRIBLE performance, see the users' manual chapter on matrices. 
668 669 By default, this format uses inodes (identical nodes) when possible, to 670 improve numerical efficiency of matrix-vector products and solves. We 671 search for consecutive rows with the same nonzero structure, thereby 672 reusing matrix information to achieve increased efficiency. 673 674 Level: intermediate 675 676 .seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE` 677 @*/ 678 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) { 679 PetscMPIInt size; 680 681 PetscFunctionBegin; 682 PetscCall(MatCreate(comm, A)); 683 PetscCall(MatSetSizes(*A, m, n, M, N)); 684 PetscCallMPI(MPI_Comm_size(comm, &size)); 685 if (size > 1) { 686 PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE)); 687 PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 688 } else { 689 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 690 PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz)); 691 } 692 PetscFunctionReturn(0); 693 } 694 695 /*MC 696 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJCUSPARSE`. 697 698 A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either 699 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 700 All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 701 702 This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator, 703 and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators, 704 `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported 705 for communicators controlling multiple processes. 
It is recommended that you call both of 706 the above preallocation routines for simplicity. 707 708 Options Database Keys: 709 + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()` 710 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 711 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 712 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 713 714 Level: beginner 715 716 .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 717 M*/ 718 719 /*MC 720 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJCUSPARSE`. 721 722 Level: beginner 723 724 .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE` 725 M*/ 726 727 // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 
/*
  MatCUSPARSEGetDeviceMatWrite - hands back in *B a device pointer to a stripped-down
  CSR description of A (PetscSplitCSRDataStructure). Handles both sequential and MPI
  matrices. The device-side struct is built on first call and cached as deviceMat in
  the matrix's cusparse spptr; subsequent calls reuse it.

  Side effects: copies A's CSR data to the GPU if needed and marks
  A->offloadmask = PETSC_OFFLOAD_GPU. For factored matrices *B is set to NULL.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) {
  PetscSplitCSRDataStructure d_mat;                                         /* cached device-resident struct, NULL until first call */
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; /* device 32-bit CSR row offsets / column indices */
  PetscScalar               *aa = NULL, *ba = NULL;                         /* device CSR values: a* = diagonal block, b* = off-diagonal block */
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;                     /* host AIJ data for the diagonal / off-diagonal blocks */
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices carry no plain CSR representation to hand out */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca: locate the host AIJ data, the cusparse struct, and any cached device matrix
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      /* one-rank MPIAIJ matrix: all entries live in its diagonal block aij->A */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi = NULL; /* no off-diagonal block in the sequential case */
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ *)aij->A->data;
    jacb = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* compressed-row storage: build (once) a full-length row-offset array on the GPU from the host jacb->i */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  /* diagonal block: same compressed-row handling as for B above */
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt *colmap;
      PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      /* colmap[g] = local off-diagonal column index + 1 for global column g present in B; 0 means absent.
         NOTE(review): value is cast to int although colmap is PetscInt — assumes it fits in 32 bits; confirm */
      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these: cusparse indices are 32-bit, the device struct wants PetscInt
        int *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* the widened device copies must be freed by whoever destroys the struct */

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      h_mat->offdiag.i = (PetscInt *)bi; /* 32-bit build: reuse cusparse's device indices directly */
      h_mat->offdiag.j = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    /* row/column ownership ranges of this rank */
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend = A->cmap->rend;
    h_mat->M = A->cmap->N; /* NOTE(review): M is set to the global number of columns — confirm intended */
#if defined(PETSC_USE_64BIT_INDICES)
    { /* widen the diagonal-block indices, mirroring the off-diagonal handling above */
      int *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt *)ai;
    h_mat->diag.j = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}