xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 37d05b0256c1e9ba4bc423c4eccb3df226931ef0)
1 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7 #include <thrust/advance.h>
8 #include <thrust/partition.h>
9 #include <thrust/sort.h>
10 #include <thrust/unique.h>
11 #include <petscsf.h>
12 
/* Thrust functor: copy the first element of a zip-iterator tuple into the
   second. Used to gather permuted COO values into a device work array. */
struct VecCUDAEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT pair)
  {
    thrust::get<1>(pair) = thrust::get<0>(pair);
  }
};
20 
/* Free the device-side COO assembly data attached to an MPIAIJCUSPARSE matrix.

   Releases the extended-COO device arrays allocated by
   MatSetPreallocationCOO_MPIAIJCUSPARSE (jmap/perm/imap maps and the
   send/recv buffers) and the basic-COO permutation arrays coo_p/coo_pw.
   Freed device pointers are reset to NULL so they cannot dangle; the
   function is safe to call repeatedly and on matrices without a spptr. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
    /* NULL the freed pointers so stale values cannot be freed or dereferenced again */
    cusparseStruct->Ajmap1_d  = NULL;
    cusparseStruct->Aperm1_d  = NULL;
    cusparseStruct->Bjmap1_d  = NULL;
    cusparseStruct->Bperm1_d  = NULL;
    cusparseStruct->Aimap2_d  = NULL;
    cusparseStruct->Ajmap2_d  = NULL;
    cusparseStruct->Aperm2_d  = NULL;
    cusparseStruct->Bimap2_d  = NULL;
    cusparseStruct->Bjmap2_d  = NULL;
    cusparseStruct->Bperm2_d  = NULL;
    cusparseStruct->Cperm1_d  = NULL;
    cusparseStruct->sendbuf_d = NULL;
    cusparseStruct->recvbuf_d = NULL;
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
50 
/* Insert/add values from a COO array set up by
   MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic.

   v[] holds coo_nd diagonal-block values followed by coo_no off-diagonal
   values (or, when cusp->coo_p is set, the user's original order, which is
   first gathered through the recorded permutation into the device work
   array coo_pw). v may live on host or device; host data is staged through
   a temporary device array. imode is INSERT_VALUES or ADD_VALUES and is
   forwarded to the sequential blocks. v == NULL is passed through to the
   Seq routines unchanged. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total COO entries on this rank */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL; /* temporary device copy when v is host memory */

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* gather: coo_pw[k] = v[coo_p[k]], applying the diag-first partition permutation */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    /* values already arrive partitioned: first coo_nd entries for A, then coo_no for B */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
85 
/* Tuple predicate for thrust::partition: true when the column index
   (tuple element 1) lies inside the local diagonal-block column range
   [_cstart, _cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return thrust::get<1>(t) >= _cstart && thrust::get<1>(t) < _cend; }
};
93 
/* Column predicate: true when global column id c falls outside the local
   diagonal-block column range [_cstart, _cend). */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return !(c >= _cstart && c < _cend); }
};
100 
/* Unary transform: subtract a fixed offset from a global index to obtain the
   process-local index. */
struct GlobToLoc {
  PetscInt _offset;

  GlobToLoc(PetscInt offset) : _offset(offset) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &gid) { return gid - _offset; }
};
107 
/* COO preallocation for the "basic" case: every triplet (coo_i[k], coo_j[k])
   is nonnegative and its row is owned by this rank.

   The n triplets are copied to the device and, if needed, partitioned so
   diagonal-block entries come first (coo_nd of them) and off-diagonal entries
   last (coo_no); the permutation applied is recorded in cusp->coo_p (with
   value work array coo_pw) so MatSetValuesCOO can replay it. The off-diagonal
   global column ids are compacted (sort + unique) to build b->garray and a
   global-to-local mapping, and the sequential blocks b->A / b->B are
   recreated as MATSEQAIJCUSPARSE and preallocated directly on the GPU. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  /* find the first off-diagonal entry and, after it, the next diagonal entry;
     when both exist the input is not already partitioned and must be permuted */
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    /* coo_p starts as the identity; partitioning the (i,j,p) triplets records
       where each value must be gathered from at MatSetValuesCOO time */
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* translate jj[] from global to compacted-local via a hash-based global-to-local mapping */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  /* recreate the sequential diagonal (A) and off-diagonal (B, noff columns) blocks */
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
185 
/* Preallocate an MPIAIJCUSPARSE matrix from coo_n COO triplets (coo_i[], coo_j[]).

   If on every rank all host-provided triplets are nonnegative and row-owned
   locally, the device-side "basic" path is taken. Otherwise the generic
   host-side MPIAIJ extended-COO setup runs, and the assembly maps it builds
   (jmap/perm/imap arrays plus the send/recv buffers for remote entries) are
   mirrored into device memory for MatSetValuesCOO_MPIAIJCUSPARSE. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE; /* can we take the fast device-only path? */
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* discard any existing ghost/scatter state; preallocation starts from scratch */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host-side COO assembly maps built above;
       jmap arrays have one extra entry (CSR-style offsets) */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    /* upload the host maps; these are read-only for the lifetime of the COO pattern */
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
268 
/* Kernel: gather values into the send buffer, buf[i] = kv[perm[i]].
   Grid-stride loop, so any 1-D launch configuration covers all nnz entries. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) buf[i] = kv[perm[i]];
}
275 
/* Kernel: accumulate locally-owned COO values into the diagonal (A) and
   off-diagonal (B) blocks. Logical index i < Annz targets Aa[i]; i >= Annz
   targets Ba[i - Annz]. jmap[i]..jmap[i+1] delimits the (possibly repeated)
   COO entries summing into matrix slot i, and perm maps them into kv[].
   With INSERT_VALUES the slot is overwritten, otherwise added to.

   Grid-stride loop. The original mutated the loop variable (i -= Annz) in
   the B branch, which corrupts subsequent iterations whenever the grid is
   smaller than Annz + Bnnz; a local index j is used instead. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* do not modify i: it advances the grid-stride loop */
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}
292 
/* Kernel: add COO values received from remote ranks into the diagonal (A)
   and off-diagonal (B) blocks. Logical index i < Annz2 adds into
   Aa[Aimap2[i]]; i >= Annz2 adds into Ba[Bimap2[i - Annz2]]. Remote values
   are always added (the INSERT/ADD distinction was already applied to the
   local pass). Assumes each target slot is touched by at most one thread,
   so plain += is race-free.

   Grid-stride loop. The original mutated the loop variable (i -= Annz2) in
   the B branch, which corrupts subsequent iterations whenever the grid is
   smaller than Annz2 + Bnnz2; a local index j is used instead. */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; /* do not modify i: it advances the grid-stride loop */
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}
306 
/* Insert/add COO values on the device using the maps built by
   MatSetPreallocationCOO_MPIAIJCUSPARSE.

   In the extended-COO case: v (host or device; host data is staged to a
   temporary device buffer) is split into local contributions, which are
   summed directly into the A/B blocks, and remote contributions, which are
   packed, exchanged via the coo_sf star forest (overlapped with the local
   kernel), and then added. Otherwise the basic path is used. On return the
   matrix data is up to date on the GPU.

   Note: the original also called MPI_Comm_size here, but the result was
   never used; the dead (collective, but rank-consistent) call is removed. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the device staging copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
376 
/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix *A_loc on the GPU. When glob is non-NULL, also return the global
   column index of each merged local column: the owned columns (contiguous
   from the column ownership start) come first, followed by the ghost
   columns taken from the off-diagonal column map. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (PetscInt k = 0; k < dn; k++) gidx[k] = cst + k;      /* owned columns */
    for (PetscInt k = 0; k < on; k++) gidx[dn + k] = cmap[k]; /* ghost columns */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
398 
/* Classic (non-COO) preallocation: set up the sequential diagonal block b->A
   and off-diagonal block b->B as MATSEQAIJCUSPARSE matrices with the given
   per-row nonzero estimates (d_nz/d_nnz for the diagonal block, o_nz/o_nnz
   for the off-diagonal block), propagating the stored cuSPARSE storage
   formats and the matrix's CPU-binding state to both blocks. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* debug-only validation of the user-supplied nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
  /* discard stale ghost-column/scatter state from any previous assembly */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* on a single rank there is no off-diagonal part, so give B zero columns */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
445 
/* yy = A * xx: overlap the ghost-value scatter with the diagonal-block
   multiply, then accumulate the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* start communicating the ghost entries of xx into the local work vector */
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* diagonal block works on owned entries while the scatter is in flight */
  PetscCall((*mpiaij->A->ops->mult)(mpiaij->A, xx, yy));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* add the off-diagonal block applied to the received ghost values */
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}
457 
/* Zero all stored entries by zeroing the diagonal and off-diagonal blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(mpiaij->A));
  PetscCall(MatZeroEntries(mpiaij->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
467 
/* zz = yy + A * xx: same scatter/compute overlap as MatMult, with the
   diagonal block doing the multiply-add against yy. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* start communicating the ghost entries of xx into the local work vector */
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* zz = yy + Ad * xx runs while the scatter is in flight */
  PetscCall((*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* zz += Ao * (ghost values) */
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
479 
/* yy = A^T * xx: apply both block transposes locally, then reverse-scatter
   the off-diagonal contributions back to their owning ranks with addition. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* contributions destined for other ranks land in the local work vector */
  PetscCall((*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec));
  PetscCall((*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy));
  /* reverse scatter adds the off-process contributions into yy */
  PetscCall(VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
491 
/* Record the cuSPARSE storage format to use for SpMV on the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix, selected by op. The format
   is applied to the corresponding Seq block(s) when they are (re)created. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: previously listed MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
514 
/* Process the -mat_cusparse_* command-line options controlling the cuSPARSE
   storage formats of the diagonal and off-diagonal blocks. Only applies to
   unfactored matrices; each option takes effect immediately through
   MatCUSPARSESetFormat. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* per-block format options; the current stored format is offered as the default */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    /* combined option sets both blocks at once */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
535 
/* Finish assembly: run the generic MPIAIJ assembly, switch the ghost work
   vector to the CUDA vector type, and, if the nonzero pattern changed during
   assembly, free the cached device-side split-CSR view (it must be rebuilt). */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate; /* state before assembly, to detect pattern changes */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    /* the struct lives on the device: copy it back to host to read its members before freeing them */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
566 
/* Destroy an MPIAIJCUSPARSE matrix: free the cached device split-CSR view
   (if any), the COO device data, the cusparse context, the composed method
   functions, and finally the underlying MPIAIJ storage. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    /* the struct lives on the device: copy it back to host to read its members before freeing them */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  /* detach the type-specific methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
604 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE.

   Depending on reuse, the result is a duplicate of B, copied into *newmat,
   or B itself converted in place. The sequential blocks and the ghost work
   vector are retyped to their CUDA variants, the cusparse context is
   attached to spptr, the CUDA-specific operations are installed in the op
   table, and the type-specific methods are composed on the object. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  /* for MAT_INPLACE_MATRIX, *newmat is B itself */
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  /* retype existing sub-objects to their CUDA counterparts */
  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
646 
/* Type constructor registered for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* ensure the CUDA device is up before any device-side allocation can happen */
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  /* in-place conversion installs the CUSPARSE ops table and composed functions */
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
655 
656 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.
663 
664    Collective
665 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
.  n - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
.  M - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the matrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the matrix
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the matrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the matrix
           (possibly different for each row) or NULL
673 
674    Output Parameter:
675 .  A - the matrix
676 
677    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
678    MatXXXXSetPreallocation() paradigm instead of this routine directly.
679    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
680 
681    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
683 
684    The AIJ format, also called the
685    compressed row storage), is fully compatible with standard Fortran 77
686    storage.  That is, the stored row and column indices can begin at
687    either one (as in Fortran) or zero.  See the users' manual for details.
688 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
691    allocation.  For large problems you MUST preallocate memory or you
692    will get TERRIBLE performance, see the users' manual chapter on matrices.
693 
694    By default, this format uses inodes (identical nodes) when possible, to
695    improve numerical efficiency of matrix-vector products and solves. We
696    search for consecutive rows with the same nonzero structure, thereby
697    reusing matrix information to achieve increased efficiency.
698 
699    Level: intermediate
700 
701 .seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
702 @*/
703 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
704 {
705   PetscMPIInt size;
706 
707   PetscFunctionBegin;
708   PetscCall(MatCreate(comm, A));
709   PetscCall(MatSetSizes(*A, m, n, M, N));
710   PetscCallMPI(MPI_Comm_size(comm, &size));
711   if (size > 1) {
712     PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
713     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
714   } else {
715     PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
716     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
717   }
718   PetscFunctionReturn(PETSC_SUCCESS);
719 }
720 
721 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.
723 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
726    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
727 
728    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
729    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
730    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
731    for communicators controlling multiple processes.  It is recommended that you call both of
732    the above preallocation routines for simplicity.
733 
734    Options Database Keys:
735 +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
736 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
737 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
738 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
739 
740   Level: beginner
741 
742  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
743 M*/
744 
745 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
747 
748   Level: beginner
749 
750  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
751 M*/
752 
// Get GPU pointers to a stripped-down CSR view of the matrix for device-side assembly.
// Works for both sequential and MPI matrices. On first call the PetscSplitCSRDataStructure
// is built on the host and copied to the device; a cached device struct is returned otherwise.
// Returns *B = NULL for factored matrices, which have no plain CSR representation to expose.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;           // device-side struct, taken from spptr->deviceMat (NULL triggers creation below)
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; // device CSR row offsets / column indices (32-bit, as stored by cuSPARSE)
  PetscScalar               *aa = NULL, *ba = NULL;                         // device CSR values for the diagonal (a*) and off-diagonal (b*) blocks
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;                     // host-side SeqAIJ data of the diagonal / off-diagonal blocks
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca: locate the (sequential) diagonal block, its cuSPARSE struct, and any cached device matrix
  if (size == 1) {
    PetscBool isseqaij;

    // a single-rank matrix may be either a true SeqAIJ or an MPIAIJ on a 1-process communicator
    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      // no off-diagonal block exists on a single rank
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      // make sure both blocks are current on the device before taking raw pointers
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        // compressed-row storage omits empty rows; build (and cache) a full-length row-offset array on the GPU
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    // same full-length row-offset treatment for the diagonal block
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host, then copy it to the device
    // NOTE(review): the freshly created d_mat does not appear to be stored back into
    // spptr->deviceMat / cusparsestructA->deviceMat in this function — confirm caching
    // happens elsewhere, otherwise the "Reusing device matrix" path never triggers.
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      // colmap maps a global column index to (local off-diagonal column + 1);
      // calloc zero-fill means 0 marks columns not present in the off-diagonal block
      // NOTE(review): the (int) cast of ii+1 into a PetscInt slot looks spurious — harmless, but confirm
      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a 64-bit (PetscInt) copy of the 32-bit device indices:
        // round-trip device->host (32-bit), widen on the host, then host->device (64-bit)
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        // the widened copies are owned by h_mat and must be freed when the device matrix is destroyed
        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      // 32-bit PetscInt: the cuSPARSE index arrays can be aliased directly
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    // ownership ranges of this rank
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; // NOTE(review): M is assigned the global number of COLUMNS here — confirm the struct field's intended meaning
#if defined(PETSC_USE_64BIT_INDICES)
    { // same 32-bit -> 64-bit widening round trip for the diagonal block indices
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat)); // only the host mirror is freed; the device copy lives on
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  // the caller is expected to write values on the device, so the GPU copy becomes authoritative
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
940