#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
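
/* VecCUDAEquals is applied below through a zip of a permutation iterator over the
   source values and a plain iterator over the destination buffer, so conceptually
   it performs the gather (a sketch, not library code):

     for (k = 0; k < n; k++) coo_pw[k] = v[coo_p[k]];

   i.e. user-provided values are gathered into the diagonal-first order computed
   at preallocation time. */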
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};
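
/* Illustration of the classification above, for a hypothetical rank owning
   columns [cstart, cend) = [4, 8): the column indices {5, 1, 9, 6} are flagged
   by IsOffDiag as {false, true, true, false}, so thrust::partition with
   IsNotOffDiagT moves the diagonal-block entries (columns 5 and 6) in front of
   the off-diagonal ones (e.g. {5, 6, 1, 9}; thrust::partition is not stable),
   while coo_p records the original positions of the entries. */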
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray; the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
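
/* Example of the global-to-local shift performed above: on a rank with
   B->rmap->rstart = 4, global row indices {4, 5, 7} become local rows {0, 1, 3};
   diagonal-block column indices are shifted by B->cmap->rstart the same way,
   while off-diagonal columns are instead compacted through the hash mapping. */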
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
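
/* A COO triplet array is "basic" only if every row index is locally owned and no
   index is negative; e.g. on a rank owning rows [rstart, rend) = [0, 4), the
   (hypothetical) pair (coo_i, coo_j) = ({0, 2, 5}, {1, 3, 0}) is not basic
   because row 5 is remote, so the extended path above, with its send/recv
   buffers and device-side maps, is taken on all ranks. */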
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}
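
/* These kernels are written as grid-stride loops, but the launches in
   MatSetValuesCOO_MPIAIJCUSPARSE() below size the grid to cover every index in
   one pass, e.g.

     MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(...)

   yields gridDim.x * blockDim.x >= Annz + Bnnz, so each thread executes the
   loop body at most once. Note the two add kernels rely on this: they shift i
   into the B range in place, which would perturb later iterations of a
   genuinely strided traversal. */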
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host memory, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
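
/* A minimal sketch of the user-level COO assembly path implemented above
   (illustrative values; a 2x2 global matrix on one rank):

     PetscInt    i[] = {0, 0, 1, 1};
     PetscInt    j[] = {0, 1, 0, 1};
     PetscScalar v[] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatSetPreallocationCOO(A, 4, i, j));   // analyzes the pattern once
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // fast repeated fills

   With a MATMPIAIJCUSPARSE matrix, v may be a host or device pointer; the
   memtype check above copies host data to the GPU before the kernels run. */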
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
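
/* An illustrative user-level call: select ELL storage for just the diagonal
   block of a MATMPIAIJCUSPARSE matrix, equivalent to the option
   -mat_cusparse_mult_diag_storage_format ell handled below:

     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_ELL));
*/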
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
  assembly performance the user should preallocate the matrix storage by setting
  the parameters `d_nz` and `o_nz` (or the arrays `d_nnz` and `o_nnz`).

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE`)
. n     - number of local columns (or `PETSC_DECIDE`)
. M     - number of global rows (or `PETSC_DETERMINE`)
. N     - number of global columns (or `PETSC_DETERMINE`)
. d_nz  - number of nonzeros per row in the diagonal block (same for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or `NULL`
. o_nz  - number of nonzeros per row in the off-diagonal block (same for all local rows)
- o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  If `d_nnz` (respectively `o_nnz`) is given then `d_nz` (respectively `o_nz`) is ignored.

  The AIJ format, also called compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `d_nz`/`o_nz` or `d_nnz`/`o_nnz` (not both).
  Set `d_nz` = `o_nz` = `PETSC_DEFAULT` and `d_nnz` = `o_nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

  By default, this format uses inodes (identical nodes) when possible, to
  improve numerical efficiency of matrix-vector products and solves. We
  search for consecutive rows with the same nonzero structure, thereby
  reusing matrix information to achieve increased efficiency.

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
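
/* An illustrative call (the sizes and nonzero estimates are hypothetical):
   create a 100x100 matrix distributed by PETSc, expecting at most 5 nonzeros
   per row in the diagonal block and 2 in the off-diagonal block:

     Mat A;
     PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
*/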
/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

   Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
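/* A minimal sketch of what the returned handle exposes to device code, using
   only fields manipulated in this file (hypothetical kernel, not part of PETSc;
   assumes a 32-bit index build, where diag.i/diag.j alias the cuSPARSE CSR
   arrays):

     __global__ void RowSums(PetscSplitCSRDataStructure d_mat, PetscScalar *sums)
     {
       int r = blockIdx.x * blockDim.x + threadIdx.x;
       if (r < d_mat->diag.n) {
         PetscScalar s = 0;
         for (PetscInt k = d_mat->diag.i[r]; k < d_mat->diag.i[r + 1]; k++) s += d_mat->diag.a[k];
         sums[r] = s; // the off-diagonal block, when present, is traversed the same way via d_mat->offdiag
       }
     }
*/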
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar               *aa = NULL, *ba = NULL;
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca            = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat           = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ *)aij->A->data;
    jacb = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());
  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}