#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

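/* Copies the first tuple element into the second; used with thrust::for_each over zipped iterators */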
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

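/*
  The gather above uses a zip of (permuted source, destination) iterators so that w[k] = v[p[k]]
  runs as one fused kernel. A standalone sketch of the same pattern (illustrative only, not
  compiled with this file; the function name is made up):
*/
#if 0
static void GatherThroughPermutation(const THRUSTARRAY &v, const THRUSTINTARRAY &perm, THRUSTARRAY &w)
{
  /* w[k] = v[perm[k]] for all k */
  auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(v.begin(), perm.begin()), w.begin()));
  auto last  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(v.begin(), perm.end()), w.end()));
  thrust::for_each(first, last, VecCUDAEquals());
}
#endif
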
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " COO size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

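/*
  The routine above leans on three Thrust primitives: partition (diagonal-block entries first,
  off-diagonal entries next), sort, and unique (to compact the off-diagonal column ids into
  b->garray). A self-contained sketch of the sort+unique compaction step (illustrative only, not
  compiled with this file; the function name is made up):
*/
#if 0
static PetscErrorCode BuildGarraySketch(THRUSTINTARRAY &offdiag_cols, PetscInt **garray, PetscInt *ngarray)
{
  PetscFunctionBegin;
  /* sort the global off-diagonal column ids, then drop duplicates; the surviving prefix is garray */
  thrust::sort(thrust::device, offdiag_cols.begin(), offdiag_cols.end());
  auto uend = thrust::unique(thrust::device, offdiag_cols.begin(), offdiag_cols.end());
  *ngarray  = (PetscInt)thrust::distance(offdiag_cols.begin(), uend);
  PetscCall(PetscMalloc1(*ngarray, garray));
  PetscCallCUDA(cudaMemcpy(*garray, thrust::raw_pointer_cast(offdiag_cols.data()), *ngarray * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
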
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

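/*
  The kernels below consume the extended-COO mapping arrays set up above: for nonzero i of the
  diagonal (off-diagonal) block, Ajmap1[i]..Ajmap1[i+1] (Bjmap1[i]..Bjmap1[i+1]) delimits the run
  of repeated COO entries that sum into it, with Aperm1[]/Bperm1[] giving their positions in the
  user-provided value array. Cperm1[] selects the entries destined for other ranks, and the
  imap2/jmap2/perm2 arrays play the corresponding roles for values received from remote ranks.
*/
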
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* local index into the B arrays; keep i intact for the grid-stride loop */
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; /* local index into the B arrays; keep i intact for the grid-stride loop */
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* if the user gave v[] on the host, copy it to the device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

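/*
  A minimal sketch of driving the COO assembly path implemented above (illustrative only, not
  compiled with this file; the function name, sizes, indices, and values are made up). Repeated
  (i,j) pairs are summed, and entries whose row lies outside the local ownership range take the
  extended (communication) path exercised by MatSetValuesCOO_MPIAIJCUSPARSE().
*/
#if 0
static PetscErrorCode AssembleCOOSketch(MPI_Comm comm)
{
  Mat         A;
  PetscInt    coo_i[] = {0, 0, 1}; /* global row indices; the duplicated (0,0) entries are summed */
  PetscInt    coo_j[] = {0, 0, 1}; /* global column indices */
  PetscScalar v[]     = {1.0, 2.0, 3.0};

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, 2, 2, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetType(A, MATMPIAIJCUSPARSE));
  PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
  PetscCall(MatSetValuesCOO(A, v, ADD_VALUES)); /* v may live in host or device memory */
  PetscCall(MatDestroy(&A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
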
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B may have been resized, we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
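  /* overlap the scatter of the needed off-process entries of xx into a->lvec with the diagonal-block product */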
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
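  /* apply the off-diagonal block transpose into the local work vector first, then scatter-add those contributions into yy on the owning ranks */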
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

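/*
  A minimal usage sketch for the selector above (illustrative only, not compiled with this file):
  switch both blocks of a MATMPIAIJCUSPARSE matrix A to ELL storage before assembly.
*/
#if 0
  PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_ELL));
#endif
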
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

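/*
  The options above take effect when an application calls MatSetFromOptions(), e.g. (hypothetical
  application binary):

    ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
*/
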
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters `d_nz`/`o_nz` (or the arrays `d_nnz`/`o_nnz`).

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
.  n - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
.  M - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix
           (possibly different for each row) or `NULL`
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix
           (possibly different for each row) or `NULL`

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored

   The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

   Specify the preallocated storage with either `d_nz`/`o_nz` or `d_nnz`/`o_nnz` (not both).
   Set `d_nz` = `o_nz` = `PETSC_DEFAULT` and `d_nnz` = `o_nnz` = `NULL` for PETSc to control dynamic memory
   allocation.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

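/*
  A minimal usage sketch (illustrative only, not compiled with this file): create a 100x100
  AIJCUSPARSE matrix with at most 5 nonzeros per row in the diagonal block and at most 2 in the
  off-diagonal block.
*/
#if 0
  Mat A;
  PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
  /* ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), use A ... */
  PetscCall(MatDestroy(&A));
#endif
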
/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

 .seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

 .seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

// get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar               *aa = NULL, *ba = NULL;
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      PetscCall(PetscCalloc1(N + 1, &colmap));
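      /* colmap[g] = 1 + (local off-diagonal column of global column g); entries left zero by PetscCalloc1() mean the column is not present in the off-diagonal block */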
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
961