#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
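
/* Worked micro-example (illustrative numbers only, not taken from any run): with two ranks
   and B->cmap split as [0,2) on rank 0 and [2,4) on rank 1, rank 0 receiving the COO
   entries (0,3), (1,0), (0,2) keeps (1,0) in the diagonal block b->A and routes (0,3) and
   (0,2) to the off-diagonal block b->B, so coo_nd = 1 and coo_no = 2. The off-diagonal
   columns {3,2} are sorted and uniqued into garray = {2,3}, and jj[] is compacted to
   {1,0}. coo_p records the permutation applied by thrust::partition above so that
   MatSetValuesCOO_MPIAIJCUSPARSE_Basic() can later reorder the user's v[] the same way. */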

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
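
/* A minimal sketch (hypothetical indices) of input that forces the extended COO path: any
   rank passing a negative index, or a row it does not own, makes coo_basic false on every
   rank via the allreduce above, e.g.

     PetscInt i[] = {rstart - 1, rstart}; // first entry belongs to the previous rank
     PetscInt j[] = {0, 1};
     PetscCall(MatSetPreallocationCOO(mat, 2, i, j));

   In the extended path negative indices are accepted and the corresponding values are
   simply ignored when MatSetValuesCOO() is called. */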

__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}
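
/* Data-layout reminder for the two kernels above, with made-up sizes: if the local part of
   the user's COO input hits diagonal-block nonzero 0 twice (through v[4] and v[7]) and
   nonzero 1 once (through v[2]), then

     Ajmap1[] = {0, 2, 3}; // CSR-like offsets into Aperm1[], Annz = 2
     Aperm1[] = {4, 7, 2}; // indices into the caller-provided v[]

   and MatAddLocalCOOValues() accumulates kv[Aperm1[k]] for k in [Ajmap1[i], Ajmap1[i+1])
   into Aa[i]. MatAddRemoteCOOValues() adds one indirection, Aimap2[]/Bimap2[], because the
   received entries touch only a subset of the nonzeros. Note that both kernels are launched
   below with enough threads to cover every nonzero, so each thread runs the loop body at
   most once and the in-loop `i -= Annz` adjustment cannot carry over to a second
   grid-stride iteration. */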

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
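
/* Typical end-to-end use of the COO interface from application code, a minimal sketch with
   hypothetical sizes (entries shown for a single rank; error checking abbreviated):

     Mat         A;
     PetscInt    i[] = {0, 0, 1}, j[] = {0, 2, 1};
     PetscScalar v[] = {1.0, 2.0, 3.0};

     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 2, 3));
     PetscCall(MatSetType(A, MATMPIAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 3, i, j));
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // may be called repeatedly with new values

   If v is a device pointer, the PetscGetMemType() check above skips the host-to-device
   copy and the kernels read the caller's array directly. */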

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - This value should be the same as the local size used in creating the
       x vector for the matrix-vector product y = Ax (or `PETSC_DECIDE` to have
       calculated if `N` is given). For square matrices `n` is almost always `m`.
.  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
.  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e. `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
.  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
-  o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e. `m`.

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
   The AIJ format (also called compressed row storage) is fully compatible
   with standard Fortran storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

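   Example Usage:
   A minimal sketch (sizes are placeholders; error checking omitted):
.vb
   Mat A;
   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A);
   /* ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), MatMult() ... */
   MatDestroy(&A);
.ve
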
.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
7539ae82921SPaul Mullowney 
75411a5261eSBarry Smith    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single-process communicator,
75511a5261eSBarry Smith    and `MATMPIAIJCUSPARSE` otherwise.  As a result, `MatSeqAIJSetPreallocation()` is supported
75611a5261eSBarry Smith    for single-process communicators, and `MatMPIAIJSetPreallocation()` is supported for
7579ae82921SPaul Mullowney    communicators controlling multiple processes.  It is recommended that you call both of
7589ae82921SPaul Mullowney    the above preallocation routines for simplicity.
7599ae82921SPaul Mullowney 
7609ae82921SPaul Mullowney    Options Database Keys:
7612ef1f0ffSBarry Smith +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
7622ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal blocks. Other options include ell (ellpack) and hyb (hybrid).
7632ef1f0ffSBarry Smith .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal block. Other options include ell (ellpack) and hyb (hybrid).
7642ef1f0ffSBarry Smith -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal block. Other options include ell (ellpack) and hyb (hybrid).
7659ae82921SPaul Mullowney 
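   For example, with a hypothetical executable ./ex1 that calls MatSetFromOptions() on its matrix, the
   type and storage format can be selected at run time with
.vb
   mpiexec -n 2 ./ex1 -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
.ve
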
7669ae82921SPaul Mullowney   Level: beginner
7679ae82921SPaul Mullowney 
768*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
7696bb69460SJunchao Zhang M*/
7706bb69460SJunchao Zhang 
7716bb69460SJunchao Zhang /*MC
77211a5261eSBarry Smith    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
7736bb69460SJunchao Zhang 
7746bb69460SJunchao Zhang   Level: beginner
7756bb69460SJunchao Zhang 
776*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
7779ae82921SPaul Mullowney M*/
7783fa6b06aSMark Adams 
779042217e8SBarry Smith // Get GPU pointers to a stripped-down Mat, for both sequential and MPI matrices.
780d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
781d71ae5a4SJacob Faibussowitsch {
782042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
783042217e8SBarry Smith   PetscMPIInt                size;
784042217e8SBarry Smith   int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
785042217e8SBarry Smith   PetscScalar               *aa = NULL, *ba = NULL;
78699551766SMark Adams   Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
787042217e8SBarry Smith   Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
788042217e8SBarry Smith   CsrMatrix                 *matrixA = NULL, *matrixB = NULL;
7893fa6b06aSMark Adams 
7903fa6b06aSMark Adams   PetscFunctionBegin;
79128b400f6SJacob Faibussowitsch   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
792042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
7933fa6b06aSMark Adams     *B = NULL;
7943ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
7953fa6b06aSMark Adams   }
7969566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
79799551766SMark Adams   // get jaca
7983fa6b06aSMark Adams   if (size == 1) {
799042217e8SBarry Smith     PetscBool isseqaij;
800042217e8SBarry Smith 
8019566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
802042217e8SBarry Smith     if (isseqaij) {
8033fa6b06aSMark Adams       jaca = (Mat_SeqAIJ *)A->data;
80428b400f6SJacob Faibussowitsch       PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
805042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
806042217e8SBarry Smith       d_mat           = cusparsestructA->deviceMat;
8079566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
8083fa6b06aSMark Adams     } else {
8093fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
81028b400f6SJacob Faibussowitsch       PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
811042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
8123fa6b06aSMark Adams       jaca                      = (Mat_SeqAIJ *)aij->A->data;
813042217e8SBarry Smith       cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
814042217e8SBarry Smith       d_mat                     = spptr->deviceMat;
8159566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8163fa6b06aSMark Adams     }
817042217e8SBarry Smith     if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
818042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
81928b400f6SJacob Faibussowitsch       PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
820042217e8SBarry Smith       matrixA = (CsrMatrix *)matstruct->mat;
821042217e8SBarry Smith       bi      = NULL;
822042217e8SBarry Smith       bj      = NULL;
823042217e8SBarry Smith       ba      = NULL;
824042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
825042217e8SBarry Smith   } else {
826042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
82728b400f6SJacob Faibussowitsch     PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
828042217e8SBarry Smith     jaca                      = (Mat_SeqAIJ *)aij->A->data;
82999551766SMark Adams     jacb                      = (Mat_SeqAIJ *)aij->B->data;
830042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
8313fa6b06aSMark Adams 
83208401ef6SPierre Jolivet     PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
833042217e8SBarry Smith     cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
834042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
83508401ef6SPierre Jolivet     PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
836042217e8SBarry Smith     if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
8379566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8389566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
839042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
840042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
84128b400f6SJacob Faibussowitsch       PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
84228b400f6SJacob Faibussowitsch       PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
843042217e8SBarry Smith       matrixA = (CsrMatrix *)matstructA->mat;
844042217e8SBarry Smith       matrixB = (CsrMatrix *)matstructB->mat;
8453fa6b06aSMark Adams       if (jacb->compressedrow.use) {
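        // matrixB's row offsets are in compressed-row form; build and cache full-length row offsets from the host copy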
846042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
847042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
848042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
8493fa6b06aSMark Adams         }
850042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
851042217e8SBarry Smith       } else {
852042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
853042217e8SBarry Smith       }
854042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
855042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
856042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
857042217e8SBarry Smith     d_mat = spptr->deviceMat;
858042217e8SBarry Smith   }
8593fa6b06aSMark Adams   if (jaca->compressedrow.use) {
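    // likewise, when A uses compressed-row storage the full-length row offsets must come from the host copy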
860042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
861042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
862042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
8633fa6b06aSMark Adams     }
864042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
8653fa6b06aSMark Adams   } else {
866042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
8673fa6b06aSMark Adams   }
868042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
869042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
870042217e8SBarry Smith 
871042217e8SBarry Smith   if (!d_mat) {
872042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
873042217e8SBarry Smith 
874042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
8759566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Create device matrix\n"));
8769566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
8779566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
878042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
879042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
88099551766SMark Adams       PetscInt   *colmap;
881042217e8SBarry Smith       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;
882042217e8SBarry Smith 
88308401ef6SPierre Jolivet       PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
884042217e8SBarry Smith 
8859566063dSJacob Faibussowitsch       PetscCall(PetscCalloc1(N + 1, &colmap));
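      // colmap[g] = (local off-diagonal column of global column g) + 1; entries left zero by PetscCalloc1() mark columns not present in B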
886042217e8SBarry Smith       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
887365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
888365b711fSMark Adams       { // the device CSR arrays hold 32-bit indices; make 64-bit PetscInt versions of them
88999551766SMark Adams         int      *h_bi32, *h_bj32;
89099551766SMark Adams         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
8919566063dSJacob Faibussowitsch         PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
8929566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
89399551766SMark Adams         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
8949566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
89599551766SMark Adams         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
89699551766SMark Adams 
8979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
8989566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
8999566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
9009566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));
90199551766SMark Adams 
90299551766SMark Adams         h_mat->offdiag.i         = d_bi64;
90399551766SMark Adams         h_mat->offdiag.j         = d_bj64;
90499551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
90599551766SMark Adams 
9069566063dSJacob Faibussowitsch         PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
907365b711fSMark Adams       }
908365b711fSMark Adams #else
90999551766SMark Adams       h_mat->offdiag.i         = (PetscInt *)bi;
91099551766SMark Adams       h_mat->offdiag.j         = (PetscInt *)bj;
91199551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
912365b711fSMark Adams #endif
913042217e8SBarry Smith       h_mat->offdiag.a = ba;
914042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
915042217e8SBarry Smith 
9169566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
9179566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
9189566063dSJacob Faibussowitsch       PetscCall(PetscFree(colmap));
919042217e8SBarry Smith     }
920042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
921042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
922042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
923042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
92449b994a9SMark Adams     h_mat->M      = A->cmap->N;
925365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
926365b711fSMark Adams     {
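      // widen the 32-bit device indices of the diagonal block to 64-bit PetscInt, as done for the off-diagonal block above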
92799551766SMark Adams       int      *h_ai32, *h_aj32;
92899551766SMark Adams       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
92963a3b9bcSJacob Faibussowitsch 
93063a3b9bcSJacob Faibussowitsch       static_assert(sizeof(PetscInt) == 8, "");
9319566063dSJacob Faibussowitsch       PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
9329566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
93399551766SMark Adams       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
9349566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
93599551766SMark Adams       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
93699551766SMark Adams 
9379566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
9389566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
9399566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
9409566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));
94199551766SMark Adams 
94299551766SMark Adams       h_mat->diag.i            = d_ai64;
94399551766SMark Adams       h_mat->diag.j            = d_aj64;
94499551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
94599551766SMark Adams 
9469566063dSJacob Faibussowitsch       PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
947365b711fSMark Adams     }
948365b711fSMark Adams #else
94999551766SMark Adams     h_mat->diag.i            = (PetscInt *)ai;
95099551766SMark Adams     h_mat->diag.j            = (PetscInt *)aj;
95199551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
952365b711fSMark Adams #endif
953042217e8SBarry Smith     h_mat->diag.a = aa;
954042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
955042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
956042217e8SBarry Smith     // copy pointers and metadata to device
9579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
9589566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
959042217e8SBarry Smith   } else {
9609566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Reusing device matrix\n"));
961042217e8SBarry Smith   }
962042217e8SBarry Smith   *B             = d_mat;
963042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
9643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
9653fa6b06aSMark Adams }
966