#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy the off-diagonal column indices to the CPU so they can be mapped to local indices */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory; the cuSPARSE-specific preallocation routine manages it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
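
/* A small worked example of the partition performed above (illustrative only):
   on a rank owning columns [cstart, cend) = [4, 8), the COO input

     coo_i = {5, 5, 6, 7}
     coo_j = {4, 9, 2, 6}

   is split into a diagonal part containing (5,4) and (7,6) followed by an
   off-diagonal part containing (5,9) and (6,2) (order within each part is not
   guaranteed), so coo_nd = 2 and coo_no = 2. Row indices are then shifted by
   rstart, diagonal column indices by cstart, and the off-diagonal column
   indices are compacted through b->garray. */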

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
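
/* A minimal usage sketch of the COO assembly interface handled above, assuming
   a matrix A of type MATMPIAIJCUSPARSE whose sizes have already been set:

     PetscInt    i[] = {0, 0, 1};       // global row indices
     PetscInt    j[] = {0, 2, 1};       // global column indices
     PetscScalar v[] = {1.0, 2.0, 3.0}; // may live on the host or the device

     PetscCall(MatSetPreallocationCOO(A, 3, i, j));
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));

   When every rank passes only local, nonnegative indices, the basic path above
   is taken; remote or negative (ignored) entries select the extended path. */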

__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}
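
/* How the kernels above interpret the COO maps (inferred from their use here):
   jmap1 is a CSR-like offset array over the nonzeros of the local diagonal (A)
   and off-diagonal (B) blocks, so nonzero i accumulates kv[perm1[k]] for
   k = jmap1[i], ..., jmap1[i+1]-1; duplicate COO entries are thereby summed
   into one nonzero. In the remote kernel, imap2[i] additionally selects which
   nonzero receives the i-th received contribution, since only some nonzeros
   get remote updates. For example, Ajmap1 = {0, 2, 3} with Aperm1 = {0, 4, 2}
   means nonzero 0 receives kv[0] + kv[4] and nonzero 1 receives kv[2]. */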

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
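
/* A minimal usage sketch of the merge above, assuming an assembled
   MATMPIAIJCUSPARSE matrix A:

     Mat A_loc;
     IS  glob;

     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &A_loc));
     // A_loc is [Ad Ao] merged into one sequential matrix; glob maps its local
     // columns (diagonal block first, then off-diagonal) to global columns
     PetscCall(ISDestroy(&glob));
     PetscCall(MatDestroy(&A_loc));
*/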

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B may have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
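
/* A minimal usage sketch of the preallocation above, assuming a communicator
   comm and caller-chosen local sizes m and n:

     Mat A;

     PetscCall(MatCreate(comm, &A));
     PetscCall(MatSetSizes(A, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
     PetscCall(MatSetType(A, MATMPIAIJCUSPARSE));
     // at most 5 nonzeros per row in the diagonal block and 2 in the
     // off-diagonal block; pass d_nnz/o_nnz arrays for per-row counts
     PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL));
*/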

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
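
/* A minimal usage sketch of the format selection above; the equivalent
   -mat_cusparse_*_storage_format options are handled in MatSetFromOptions below:

     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_ELL));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, MAT_CUSPARSE_HYB));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_CSR)); // both blocks
*/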

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
527     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
528     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
529     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
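
/* A minimal sketch of reaching the conversion above from user code, assuming
   an existing assembled MATMPIAIJ matrix B:

     Mat Bgpu;

     PetscCall(MatConvert(B, MATMPIAIJCUSPARSE, MAT_INITIAL_MATRIX, &Bgpu));
     // or in place: MatConvert(B, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)
     // or simply -mat_type mpiaijcusparse together with MatSetFromOptions()
*/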

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - This value should be the same as the local size used in creating the
       x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
       calculated if `N` is given) For square matrices `n` is almost always `m`.
.  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
.  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
.  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
-  o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e `m`.

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   The AIJ format (also called compressed row storage) is fully compatible with standard
   Fortran storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
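
/* A minimal usage sketch, assuming global sizes M and N and at most 3 nonzeros
   per row in the diagonal block and 1 in the off-diagonal block:

     Mat A;

     PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, M, N, 3, NULL, 1, NULL, &A));
*/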

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar               *aa = NULL, *ba = NULL;
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
875     PetscCall(PetscInfo(A, "Create device matrix\n"));
876     PetscCall(PetscNew(&h_mat));
877     PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
878     if (size > 1) { /* need the colmap array */
879       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
880       PetscInt   *colmap;
881       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;
882 
883       PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
884 
885       PetscCall(PetscCalloc1(N + 1, &colmap));
886       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
887 #if defined(PETSC_USE_64BIT_INDICES)
      { // build 64-bit copies of the 32-bit device index arrays
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
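
/* A hedged sketch of consuming the struct produced above in device code; the
   field layout (diag.i/diag.j/diag.a in local indices, plus rstart/rend and
   colmap) is inferred from how h_mat is filled in this file, and real-valued
   PetscScalar is assumed for the printf:

     __global__ static void DumpDiagRow(PetscSplitCSRDataStructure d_mat, PetscInt row)
     {
       for (PetscInt k = d_mat->diag.i[row]; k < d_mat->diag.i[row + 1]; k++)
         printf("row %d col %d val %g\n", (int)row, (int)d_mat->diag.j[k], (double)d_mat->diag.a[k]);
     }
*/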
966