1 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 2 3 #include <petscconf.h> 4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 5 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 7 #include <thrust/advance.h> 8 #include <thrust/partition.h> 9 #include <thrust/sort.h> 10 #include <thrust/unique.h> 11 #include <petscsf.h> 12 13 struct VecCUDAEquals { 14 template <typename Tuple> 15 __host__ __device__ void operator()(Tuple t) 16 { 17 thrust::get<1>(t) = thrust::get<0>(t); 18 } 19 }; 20 21 static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) 22 { 23 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data; 24 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 25 26 PetscFunctionBegin; 27 if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS); 28 if (cusparseStruct->use_extended_coo) { 29 PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d)); 30 PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d)); 31 PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d)); 32 PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d)); 33 PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d)); 34 PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d)); 35 PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d)); 36 PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d)); 37 PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d)); 38 PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d)); 39 PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d)); 40 PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d)); 41 PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d)); 42 } 43 cusparseStruct->use_extended_coo = PETSC_FALSE; 44 delete cusparseStruct->coo_p; 45 delete cusparseStruct->coo_pw; 46 cusparseStruct->coo_p = NULL; 47 cusparseStruct->coo_pw = NULL; 48 PetscFunctionReturn(PETSC_SUCCESS); 49 } 50 51 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 52 { 53 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 54 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr; 55 PetscInt n = cusp->coo_nd + cusp->coo_no; 56 57 PetscFunctionBegin; 58 if (cusp->coo_p && v) { 59 thrust::device_ptr<const PetscScalar> d_v; 60 THRUSTARRAY *w = NULL; 61 62 if (isCudaMem(v)) { 63 d_v = thrust::device_pointer_cast(v); 64 } else { 65 w = new THRUSTARRAY(n); 66 w->assign(v, v + n); 67 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 68 d_v = w->data(); 69 } 70 71 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin())); 72 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end())); 73 PetscCall(PetscLogGpuTimeBegin()); 74 thrust::for_each(zibit, zieit, VecCUDAEquals()); 75 PetscCall(PetscLogGpuTimeEnd()); 76 delete w; 77 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode)); 78 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode)); 79 } else { 80 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode)); 81 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode)); 82 } 83 PetscFunctionReturn(PETSC_SUCCESS); 84 } 85 86 template <typename Tuple> 87 struct IsNotOffDiagT { 88 PetscInt _cstart, _cend; 89 90 IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { } 91 __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); } 92 }; 93 94 struct IsOffDiag { 95 PetscInt _cstart, _cend; 96 97 IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { } 98 __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; } 99 }; 100 101 struct GlobToLoc { 102 PetscInt _start; 103 104 GlobToLoc(PetscInt start) : _start(start) { } 105 __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; } 106 }; 107 108 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) 109 { 110 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data; 111 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr; 112 PetscInt N, *jj; 113 size_t noff = 0; 114 THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */ 115 THRUSTINTARRAY d_j(n); 116 ISLocalToGlobalMapping l2g; 117 118 PetscFunctionBegin; 119 PetscCall(MatDestroy(&b->A)); 120 PetscCall(MatDestroy(&b->B)); 121 122 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 123 d_i.assign(coo_i, coo_i + n); 124 d_j.assign(coo_j, coo_j + n); 125 PetscCall(PetscLogGpuTimeBegin()); 126 auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend)); 127 auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend)); 128 if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 129 cusp->coo_p = new THRUSTINTARRAY(n); 130 cusp->coo_pw = new THRUSTARRAY(n); 131 thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0); 132 auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin())); 133 auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end())); 134 auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend)); 135 firstoffd = mzipp.get_iterator_tuple().get<1>(); 136 } 137 cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd); 138 cusp->coo_no = thrust::distance(firstoffd, d_j.end()); 139 140 /* from global to local */ 141 thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart)); 142 thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart)); 143 PetscCall(PetscLogGpuTimeEnd()); 144 145 /* copy offdiag column indices to map on the CPU */ 146 PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */ 147 PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 148 auto o_j = d_j.begin(); 149 PetscCall(PetscLogGpuTimeBegin()); 150 thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */ 151 thrust::sort(thrust::device, o_j, d_j.end()); 152 auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */ 153 PetscCall(PetscLogGpuTimeEnd()); 154 noff = thrust::distance(o_j, wit); 155 /* allocate the garray, the columns of B will not be mapped in the multiply setup */ 156 PetscCall(PetscMalloc1(noff, &b->garray)); 157 PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 158 PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt))); 159 PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g)); 160 PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH)); 161 PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj)); 162 PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no); 163 PetscCall(ISLocalToGlobalMappingDestroy(&l2g)); 164 165 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 166 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 167 PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE)); 168 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 169 PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff)); 170 PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE)); 171 172 /* GPU memory, cusparse specific call handles it internally */ 173 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get())); 174 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj)); 175 PetscCall(PetscFree(jj)); 176 177 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat)); 178 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat)); 179 180 PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 181 PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 182 PetscCall(MatSetUpMultiply_MPIAIJ(B)); 183 PetscFunctionReturn(PETSC_SUCCESS); 184 } 185 186 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 187 { 188 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data; 189 Mat_MPIAIJCUSPARSE *mpidev; 190 PetscBool coo_basic = PETSC_TRUE; 191 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 192 PetscInt rstart, rend; 193 194 PetscFunctionBegin; 195 PetscCall(PetscFree(mpiaij->garray)); 196 PetscCall(VecDestroy(&mpiaij->lvec)); 197 #if defined(PETSC_USE_CTABLE) 198 PetscCall(PetscHMapIDestroy(&mpiaij->colmap)); 199 #else 200 PetscCall(PetscFree(mpiaij->colmap)); 201 #endif 202 PetscCall(VecScatterDestroy(&mpiaij->Mvctx)); 203 mat->assembled = PETSC_FALSE; 204 mat->was_assembled = PETSC_FALSE; 205 PetscCall(MatResetPreallocationCOO_MPIAIJ(mat)); 206 PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat)); 207 if (coo_i) { 208 PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend)); 209 PetscCall(PetscGetMemType(coo_i, &mtype)); 210 if (PetscMemTypeHost(mtype)) { 211 for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */ 212 if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) { 213 coo_basic = PETSC_FALSE; 214 break; 215 } 216 } 217 } 218 } 219 /* All ranks must agree on the value of coo_basic */ 220 PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat))); 221 if (coo_basic) { 222 PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 223 } else { 224 PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j)); 225 mat->offloadmask = PETSC_OFFLOAD_CPU; 226 /* creates the GPU memory */ 227 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A)); 228 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B)); 229 mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr); 230 mpidev->use_extended_coo = PETSC_TRUE; 231 232 PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount))); 233 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount))); 234 235 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount))); 236 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount))); 237 238 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount))); 239 PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount))); 240 PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount))); 241 242 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount))); 243 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount))); 244 PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount))); 245 246 PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount))); 247 PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar))); 248 PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar))); 249 250 PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 251 PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 252 253 PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 254 PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 255 256 PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 257 PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 258 PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 259 260 PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 261 PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 262 PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice)); 263 264 PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice)); 265 } 266 PetscFunctionReturn(PETSC_SUCCESS); 267 } 268 269 __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) 270 { 271 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 272 const PetscCount grid_size = gridDim.x * blockDim.x; 273 for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]]; 274 } 275 276 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) 277 { 278 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 279 const PetscCount grid_size = gridDim.x * blockDim.x; 280 for (; i < Annz + Bnnz; i += grid_size) { 281 PetscScalar sum = 0.0; 282 if (i < Annz) { 283 for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]]; 284 Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum; 285 } else { 286 i -= Annz; 287 for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]]; 288 Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum; 289 } 290 } 291 } 292 293 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) 294 { 295 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 296 const PetscCount grid_size = gridDim.x * blockDim.x; 297 for (; i < Annz2 + Bnnz2; i += grid_size) { 298 if (i < Annz2) { 299 for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]]; 300 } else { 301 i -= Annz2; 302 for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]]; 303 } 304 } 305 } 306 307 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) 308 { 309 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data); 310 Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr); 311 Mat A = mpiaij->A, B = mpiaij->B; 312 PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2; 313 PetscScalar *Aa, *Ba = NULL; 314 PetscScalar *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d; 315 const PetscScalar *v1 = v; 316 const PetscCount *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d; 317 const PetscCount *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d; 318 const PetscCount *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d; 319 const PetscCount *Cperm1 = mpidev->Cperm1_d; 320 PetscMemType memtype; 321 322 PetscFunctionBegin; 323 if (mpidev->use_extended_coo) { 324 PetscMPIInt size; 325 326 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size)); 327 PetscCall(PetscGetMemType(v, &memtype)); 328 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 329 PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar))); 330 PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 331 } 332 333 if (imode == INSERT_VALUES) { 334 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */ 335 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba)); 336 } else { 337 PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */ 338 PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba)); 339 } 340 341 /* Pack entries to be sent to remote */ 342 if (mpiaij->sendlen) { 343 MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend); 344 PetscCallCUDA(cudaPeekAtLastError()); 345 } 346 347 /* Send remote entries to their owner and overlap the communication with local computation */ 348 PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE)); 349 /* Add local entries to A and B */ 350 if (Annz + Bnnz > 0) { 351 MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba); 352 PetscCallCUDA(cudaPeekAtLastError()); 353 } 354 PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE)); 355 356 /* Add received remote entries to A and B */ 357 if (Annz2 + Bnnz2 > 0) { 358 MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba); 359 PetscCallCUDA(cudaPeekAtLastError()); 360 } 361 362 if (imode == INSERT_VALUES) { 363 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 364 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba)); 365 } else { 366 PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 367 PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba)); 368 } 369 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 370 } else { 371 PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode)); 372 } 373 mat->offloadmask = PETSC_OFFLOAD_GPU; 374 PetscFunctionReturn(PETSC_SUCCESS); 375 } 376 377 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) 378 { 379 Mat Ad, Ao; 380 const PetscInt *cmap; 381 382 PetscFunctionBegin; 383 PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap)); 384 PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc)); 385 if (glob) { 386 PetscInt cst, i, dn, on, *gidx; 387 388 PetscCall(MatGetLocalSize(Ad, NULL, &dn)); 389 PetscCall(MatGetLocalSize(Ao, NULL, &on)); 390 PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL)); 391 PetscCall(PetscMalloc1(dn + on, &gidx)); 392 for (i = 0; i < dn; i++) gidx[i] = cst + i; 393 for (i = 0; i < on; i++) gidx[i + dn] = cmap[i]; 394 PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob)); 395 } 396 PetscFunctionReturn(PETSC_SUCCESS); 397 } 398 399 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) 400 { 401 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data; 402 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 403 PetscInt i; 404 405 PetscFunctionBegin; 406 PetscCall(PetscLayoutSetUp(B->rmap)); 407 PetscCall(PetscLayoutSetUp(B->cmap)); 408 if (PetscDefined(USE_DEBUG) && d_nnz) { 409 for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]); 410 } 411 if (PetscDefined(USE_DEBUG) && o_nnz) { 412 for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]); 413 } 414 #if defined(PETSC_USE_CTABLE) 415 PetscCall(PetscHMapIDestroy(&b->colmap)); 416 #else 417 PetscCall(PetscFree(b->colmap)); 418 #endif 419 PetscCall(PetscFree(b->garray)); 420 PetscCall(VecDestroy(&b->lvec)); 421 PetscCall(VecScatterDestroy(&b->Mvctx)); 422 /* Because the B will have been resized we simply destroy it and create a new one each time */ 423 PetscCall(MatDestroy(&b->B)); 424 if (!b->A) { 425 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 426 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 427 } 428 if (!b->B) { 429 PetscMPIInt size; 430 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 431 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 432 PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0)); 433 } 434 PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE)); 435 PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE)); 436 PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 437 PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 438 PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz)); 439 PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz)); 440 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 441 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 442 B->preallocated = PETSC_TRUE; 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) 447 { 448 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 449 450 PetscFunctionBegin; 451 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 452 PetscCall((*a->A->ops->mult)(a->A, xx, yy)); 453 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 454 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); 455 PetscFunctionReturn(PETSC_SUCCESS); 456 } 457 458 PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 459 { 460 Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data; 461 462 PetscFunctionBegin; 463 PetscCall(MatZeroEntries(l->A)); 464 PetscCall(MatZeroEntries(l->B)); 465 PetscFunctionReturn(PETSC_SUCCESS); 466 } 467 468 PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 469 { 470 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 471 472 PetscFunctionBegin; 473 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 474 PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz)); 475 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 476 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); 477 PetscFunctionReturn(PETSC_SUCCESS); 478 } 479 480 PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) 481 { 482 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 483 484 PetscFunctionBegin; 485 PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); 486 PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy)); 487 PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 488 PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 489 PetscFunctionReturn(PETSC_SUCCESS); 490 } 491 492 PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 493 { 494 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 495 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 496 497 PetscFunctionBegin; 498 switch (op) { 499 case MAT_CUSPARSE_MULT_DIAG: 500 cusparseStruct->diagGPUMatFormat = format; 501 break; 502 case MAT_CUSPARSE_MULT_OFFDIAG: 503 cusparseStruct->offdiagGPUMatFormat = format; 504 break; 505 case MAT_CUSPARSE_ALL: 506 cusparseStruct->diagGPUMatFormat = format; 507 cusparseStruct->offdiagGPUMatFormat = format; 508 break; 509 default: 510 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op); 511 } 512 PetscFunctionReturn(PETSC_SUCCESS); 513 } 514 515 PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) 516 { 517 MatCUSPARSEStorageFormat format; 518 PetscBool flg; 519 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 520 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 521 522 PetscFunctionBegin; 523 PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options"); 524 if (A->factortype == MAT_FACTOR_NONE) { 525 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 526 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format)); 527 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg)); 528 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format)); 529 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 530 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 531 } 532 PetscOptionsHeadEnd(); 533 PetscFunctionReturn(PETSC_SUCCESS); 534 } 535 536 PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) 537 { 538 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data; 539 Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr; 540 PetscObjectState onnz = A->nonzerostate; 541 542 PetscFunctionBegin; 543 PetscCall(MatAssemblyEnd_MPIAIJ(A, mode)); 544 if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA)); 545 if (onnz != A->nonzerostate && cusp->deviceMat) { 546 PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 547 548 PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 549 PetscCall(PetscNew(&h_mat)); 550 PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 551 PetscCallCUDA(cudaFree(h_mat->colmap)); 552 if (h_mat->allocated_indices) { 553 PetscCallCUDA(cudaFree(h_mat->diag.i)); 554 PetscCallCUDA(cudaFree(h_mat->diag.j)); 555 if (h_mat->offdiag.j) { 556 PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 557 PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 558 } 559 } 560 PetscCallCUDA(cudaFree(d_mat)); 561 PetscCall(PetscFree(h_mat)); 562 cusp->deviceMat = NULL; 563 } 564 PetscFunctionReturn(PETSC_SUCCESS); 565 } 566 567 PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 568 { 569 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 570 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 571 572 PetscFunctionBegin; 573 PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 574 if (cusparseStruct->deviceMat) { 575 PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 576 577 PetscCall(PetscInfo(A, "Have device matrix\n")); 578 PetscCall(PetscNew(&h_mat)); 579 PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 580 PetscCallCUDA(cudaFree(h_mat->colmap)); 581 if (h_mat->allocated_indices) { 582 PetscCallCUDA(cudaFree(h_mat->diag.i)); 583 PetscCallCUDA(cudaFree(h_mat->diag.j)); 584 if (h_mat->offdiag.j) { 585 PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 586 PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 587 } 588 } 589 PetscCallCUDA(cudaFree(d_mat)); 590 PetscCall(PetscFree(h_mat)); 591 } 592 /* Free COO */ 593 PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 594 PetscCallCXX(delete cusparseStruct); 595 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL)); 596 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL)); 597 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 598 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 599 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 600 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL)); 601 PetscCall(MatDestroy_MPIAIJ(A)); 602 PetscFunctionReturn(PETSC_SUCCESS); 603 } 604 605 /* defines MatSetValues_MPICUSPARSE_Hash() */ 606 #define TYPE AIJ 607 #define TYPE_AIJ 608 #define SUB_TYPE_CUSPARSE 609 #include "../src/mat/impls/aij/mpi/mpihashmat.h" 610 #undef TYPE 611 #undef TYPE_AIJ 612 #undef SUB_TYPE_CUSPARSE 613 614 static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A) 615 { 616 Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data; 617 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 618 619 PetscFunctionBegin; 620 PetscCall(MatSetUp_MPI_Hash(A)); 621 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 622 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 623 A->preallocated = PETSC_TRUE; 624 PetscFunctionReturn(PETSC_SUCCESS); 625 } 626 627 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat) 628 { 629 Mat_MPIAIJ *a; 630 Mat A; 631 632 PetscFunctionBegin; 633 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 634 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 635 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 636 A = *newmat; 637 A->boundtocpu = PETSC_FALSE; 638 PetscCall(PetscFree(A->defaultvectype)); 639 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 640 641 a = (Mat_MPIAIJ *)A->data; 642 if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE)); 643 if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE)); 644 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 645 646 if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 647 648 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 649 A->ops->mult = MatMult_MPIAIJCUSPARSE; 650 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 651 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 652 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 653 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 654 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 655 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 656 A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE; 657 658 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 659 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 660 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 661 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 662 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 663 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 664 #if defined(PETSC_HAVE_HYPRE) 665 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 666 #endif 667 PetscFunctionReturn(PETSC_SUCCESS); 668 } 669 670 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 671 { 672 PetscFunctionBegin; 673 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 674 PetscCall(MatCreate_MPIAIJ(A)); 675 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 676 PetscFunctionReturn(PETSC_SUCCESS); 677 } 678 679 /*@ 680 MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 681 (the default parallel PETSc format). This matrix will ultimately pushed down 682 to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix 683 assembly performance the user should preallocate the matrix storage by setting 684 the parameter nz (or the array nnz). By setting these parameters accurately, 685 performance during matrix assembly can be increased by more than a factor of 50. 686 687 Collective 688 689 Input Parameters: 690 + comm - MPI communicator, set to `PETSC_COMM_SELF` 691 . m - number of rows 692 . n - number of columns 693 . nz - number of nonzeros per row (same for all rows) 694 - nnz - array containing the number of nonzeros in the various rows 695 (possibly different for each row) or NULL 696 697 Output Parameter: 698 . A - the matrix 699 700 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 701 MatXXXXSetPreallocation() paradigm instead of this routine directly. 702 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 703 704 Notes: 705 If nnz is given then nz is ignored 706 707 The AIJ format, also called the 708 compressed row storage), is fully compatible with standard Fortran 77 709 storage. That is, the stored row and column indices can begin at 710 either one (as in Fortran) or zero. See the users' manual for details. 711 712 Specify the preallocated storage with either nz or nnz (not both). 713 Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 714 allocation. For large problems you MUST preallocate memory or you 715 will get TERRIBLE performance, see the users' manual chapter on matrices. 716 717 By default, this format uses inodes (identical nodes) when possible, to 718 improve numerical efficiency of matrix-vector products and solves. We 719 search for consecutive rows with the same nonzero structure, thereby 720 reusing matrix information to achieve increased efficiency. 721 722 Level: intermediate 723 724 .seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE` 725 @*/ 726 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) 727 { 728 PetscMPIInt size; 729 730 PetscFunctionBegin; 731 PetscCall(MatCreate(comm, A)); 732 PetscCall(MatSetSizes(*A, m, n, M, N)); 733 PetscCallMPI(MPI_Comm_size(comm, &size)); 734 if (size > 1) { 735 PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE)); 736 PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 737 } else { 738 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 739 PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz)); 740 } 741 PetscFunctionReturn(PETSC_SUCCESS); 742 } 743 744 /*MC 745 MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJCUSPARSE`. 746 747 A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either 748 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 749 All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 750 751 This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator, 752 and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators, 753 `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported 754 for communicators controlling multiple processes. It is recommended that you call both of 755 the above preallocation routines for simplicity. 756 757 Options Database Keys: 758 + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()` 759 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 760 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 761 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 762 763 Level: beginner 764 765 .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 766 M*/ 767 768 /*MC 769 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJCUSPARSE`. 770 771 Level: beginner 772 773 .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE` 774 M*/ 775 776 // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 777 PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 778 { 779 PetscSplitCSRDataStructure d_mat; 780 PetscMPIInt size; 781 int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; 782 PetscScalar *aa = NULL, *ba = NULL; 783 Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 784 Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 785 CsrMatrix *matrixA = NULL, *matrixB = NULL; 786 787 PetscFunctionBegin; 788 PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix"); 789 if (A->factortype != MAT_FACTOR_NONE) { 790 *B = NULL; 791 PetscFunctionReturn(PETSC_SUCCESS); 792 } 793 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 794 // get jaca 795 if (size == 1) { 796 PetscBool isseqaij; 797 798 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij)); 799 if (isseqaij) { 800 jaca = (Mat_SeqAIJ *)A->data; 801 PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 802 cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr; 803 d_mat = cusparsestructA->deviceMat; 804 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 805 } else { 806 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 807 PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 808 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr; 809 jaca = (Mat_SeqAIJ *)aij->A->data; 810 cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr; 811 d_mat = spptr->deviceMat; 812 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 813 } 814 if (cusparsestructA->format == MAT_CUSPARSE_CSR) { 815 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat; 816 PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 817 matrixA = (CsrMatrix *)matstruct->mat; 818 bi = NULL; 819 bj = NULL; 820 ba = NULL; 821 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR"); 822 } else { 823 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 824 PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 825 jaca = (Mat_SeqAIJ *)aij->A->data; 826 jacb = (Mat_SeqAIJ *)aij->B->data; 827 Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr; 828 829 PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 830 cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr; 831 Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr; 832 PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR"); 833 if (cusparsestructB->format == MAT_CUSPARSE_CSR) { 834 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 835 PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B)); 836 Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat; 837 Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat; 838 PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 839 PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 840 matrixA = (CsrMatrix *)matstructA->mat; 841 matrixB = (CsrMatrix *)matstructB->mat; 842 if (jacb->compressedrow.use) { 843 if (!cusparsestructB->rowoffsets_gpu) { 844 cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 845 cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1); 846 } 847 bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 848 } else { 849 bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 850 } 851 bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 852 ba = thrust::raw_pointer_cast(matrixB->values->data()); 853 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR"); 854 d_mat = spptr->deviceMat; 855 } 856 if (jaca->compressedrow.use) { 857 if (!cusparsestructA->rowoffsets_gpu) { 858 cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 859 cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1); 860 } 861 ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 862 } else { 863 ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 864 } 865 aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 866 aa = thrust::raw_pointer_cast(matrixA->values->data()); 867 868 if (!d_mat) { 869 PetscSplitCSRDataStructure h_mat; 870 871 // create and populate strucy on host and copy on device 872 PetscCall(PetscInfo(A, "Create device matrix\n")); 873 PetscCall(PetscNew(&h_mat)); 874 PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat))); 875 if (size > 1) { /* need the colmap array */ 876 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 877 PetscInt *colmap; 878 PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N; 879 880 PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray"); 881 882 PetscCall(PetscCalloc1(N + 1, &colmap)); 883 for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1); 884 #if defined(PETSC_USE_64BIT_INDICES) 885 { // have to make a long version of these 886 int *h_bi32, *h_bj32; 887 PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 888 PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64)); 889 PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost)); 890 for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i]; 891 PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost)); 892 for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i]; 893 894 PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64))); 895 PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice)); 896 PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64))); 897 PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice)); 898 899 h_mat->offdiag.i = d_bi64; 900 h_mat->offdiag.j = d_bj64; 901 h_mat->allocated_indices = PETSC_TRUE; 902 903 PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64)); 904 } 905 #else 906 h_mat->offdiag.i = (PetscInt *)bi; 907 h_mat->offdiag.j = (PetscInt *)bj; 908 h_mat->allocated_indices = PETSC_FALSE; 909 #endif 910 h_mat->offdiag.a = ba; 911 h_mat->offdiag.n = A->rmap->n; 912 913 PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap))); 914 PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice)); 915 PetscCall(PetscFree(colmap)); 916 } 917 h_mat->rstart = A->rmap->rstart; 918 h_mat->rend = A->rmap->rend; 919 h_mat->cstart = A->cmap->rstart; 920 h_mat->cend = A->cmap->rend; 921 h_mat->M = A->cmap->N; 922 #if defined(PETSC_USE_64BIT_INDICES) 923 { 924 int *h_ai32, *h_aj32; 925 PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 926 927 static_assert(sizeof(PetscInt) == 8, ""); 928 PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64)); 929 PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost)); 930 for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i]; 931 PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost)); 932 for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i]; 933 934 PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64))); 935 PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice)); 936 PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64))); 937 PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice)); 938 939 h_mat->diag.i = d_ai64; 940 h_mat->diag.j = d_aj64; 941 h_mat->allocated_indices = PETSC_TRUE; 942 943 PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64)); 944 } 945 #else 946 h_mat->diag.i = (PetscInt *)ai; 947 h_mat->diag.j = (PetscInt *)aj; 948 h_mat->allocated_indices = PETSC_FALSE; 949 #endif 950 h_mat->diag.a = aa; 951 h_mat->diag.n = A->rmap->n; 952 h_mat->rank = PetscGlobalRank; 953 // copy pointers and metadata to device 954 PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice)); 955 PetscCall(PetscFree(h_mat)); 956 } else { 957 PetscCall(PetscInfo(A, "Reusing device matrix\n")); 958 } 959 *B = d_mat; 960 A->offloadmask = PETSC_OFFLOAD_GPU; 961 PetscFunctionReturn(PETSC_SUCCESS); 962 } 963