xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision f6dfbefd03961ab3be6f06be75c96cbf27a49667)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Binary-tuple functor for thrust::for_each: copies tuple slot 0 into tuple
   slot 1.  Used to scatter permuted COO values into a work array. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    using thrust::get;
    get<1>(t) = get<0>(t);
  }
};
20 
/* Free all device-side COO assembly data hanging off the Mat_MPIAIJCUSPARSE
   structure: the extended-COO maps/permutations/communication buffers and the
   basic-COO permutation arrays.  Safe to call repeatedly or before any COO
   preallocation has happened. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0); /* nothing was ever allocated */
  if (cusparseStruct->use_extended_coo) {
    /* device arrays allocated by the extended path of MatSetPreallocationCOO_MPIAIJCUSPARSE */
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  /* clearing the flag guards against a double free on a subsequent call */
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* delete of NULL is a no-op, so this is safe when the basic path never allocated them */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
49 
/* Insert/add COO values using the "basic" layout produced by
   MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(): the first coo_nd entries feed
   the diagonal block a->A and the remaining coo_no entries the off-diagonal
   block a->B.  v[] may live on host or device (detected via isCudaMem). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total COO entries */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    /* coo_p records the diag-first permutation applied at preallocation time;
       apply the same permutation to v[] into the work array coo_pw first */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* already on device: use in place */
    } else {
      /* host values: stage them into a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* coo_pw[i] = v[coo_p[i]] via permutation iterator zipped with the output */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    /* no permutation was needed (input already diag-first): split v directly */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}
83 
/* Predicate for thrust::partition over (row, col, perm) tuples: true when the
   column index (tuple slot 1) falls inside the local diagonal-block column
   range [_cstart, _cend). */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend; /* De Morgan of !(col < _cstart || col >= _cend) */
  }
};
91 
/* Predicate: true when column index c lies outside the local diagonal-block
   column range [_cstart, _cend), i.e. belongs to the off-diagonal block. */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) {
    const bool in_diag_range = (c >= _cstart && c < _cend);
    return !in_diag_range;
  }
};
98 
/* Unary functor mapping a global index to a local one by subtracting the
   owned-range start supplied at construction. */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) {
    const PetscInt local = c - _start;
    return local;
  }
};
105 
/* "Basic" COO preallocation: all (coo_i, coo_j) entries are assumed locally
   owned.  Partitions the entries on the device into diagonal-block-first order,
   builds the off-diagonal column map (garray), and creates/preallocates the
   sequential sub-matrices b->A and b->B.  Records coo_nd/coo_no (and, when a
   partition was needed, the permutation coo_p and work array coo_pw) for later
   use by MatSetValuesCOO_MPIAIJCUSPARSE_Basic(). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  /* the sub-matrices are rebuilt from scratch below */
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  /* locate the first off-diag entry and the first diag entry after it; if both
     exist the input is not diag-first and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* remember the permutation so values can be reordered identically later */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    /* stable layout: partition (i, j, perm) triples so diag entries precede off-diag */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* translate the saved global offdiag column ids in jj[] to compact local ids via a hash map */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  /* diagonal block: local rows x local columns */
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A));
  /* off-diagonal block: local rows x noff compacted ghost columns */
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
184 
/* COO preallocation entry point.  Chooses between the fast "basic" path (all
   entries locally owned with nonnegative indices) and the general MPIAIJ path
   that supports remote and negative entries; in the general case it mirrors the
   host-side COO maps/permutations onto the device for later use by
   MatSetValuesCOO_MPIAIJCUSPARSE(). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* discard any existing off-process column map and communication machinery;
     preallocation rebuilds them from scratch */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  /* release previous COO data from both the host (MPIAIJ) and device (CUSPARSE) sides */
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    /* only host arrays are scanned here; device arrays keep coo_basic = TRUE */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* general path: host-side setup computes the extended COO maps ... */
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* ... then each host map/permutation array is mirrored to the device:
       *jmap1/*perm1 drive local insertion, *imap2/*jmap2/*perm2 drive insertion
       of received remote entries, Cperm1 packs the send buffer */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
266 
/* Kernel: gather kv[] through perm[] into the contiguous buffer buf[],
   buf[i] = kv[perm[i]].  Grid-stride loop, so any launch configuration
   covers all nnz entries. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) {
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) buf[i] = kv[perm[i]];
}
272 
/* Kernel: accumulate locally-provided COO values into the nonzeros of the
   diagonal block (Aa) and off-diagonal block (Ba).  Logical index space is
   [0, Annz + Bnnz): the first Annz indices address A's nonzeros, the rest B's.
   For nonzero i, *jmap1[i]..*jmap1[i+1] delimits the run of kv entries (read
   through *perm1) that sum into it.  INSERT_VALUES overwrites; otherwise the
   sum is added to the existing value.  Grid-stride loop. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      /* use a separate index into B instead of mutating the loop variable i,
         which would corrupt the grid-stride advance when grid_size < Annz + Bnnz */
      const PetscCount j = i - Annz;
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}
288 
/* Kernel: add received remote COO values kv[] into A and B.  Logical index
   space is [0, Annz2 + Bnnz2): index i selects target nonzero *imap2[i]; the
   run *jmap2[i]..*jmap2[i+1] (read through *perm2) sums into it.  Always adds
   (remote contributions are accumulated after the local pass).  Grid-stride
   loop. */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      /* separate B index: mutating i (as with i -= Annz2) would break the
         grid-stride advance when grid_size < Annz2 + Bnnz2 */
      const PetscCount j = i - Annz2;
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}
301 
/* Set/add matrix values from a COO value array v[] (host or device).  In the
   extended-COO path: pack entries destined for other ranks, start the SF
   reduce, overlap it with the local accumulation kernel, then fold in the
   received remote entries.  Falls back to the basic path when extended COO was
   not set up.  Leaves the matrix data valid on the GPU. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v; /* device-resident view of v[] */
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    /* free the staging copy if we made one */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
370 
/* Merge the diagonal and off-diagonal sequential blocks of A into one local
   matrix A_loc (cusparse merge).  When glob is non-NULL, also build the index
   set mapping the merged local columns to global columns: the owned column
   range first, then the off-diagonal columns from cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) {
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt dn, on, cst, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    /* first dn entries: contiguous owned columns; remaining on entries: cmap */
    for (PetscInt k = 0; k < dn + on; k++) gidx[k] = (k < dn) ? cst + k : cmap[k - dn];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}
391 
/* Standard (non-COO) preallocation for MPIAIJCUSPARSE: validate the nnz hints,
   tear down stale column maps/scatters, (re)create the sequential CUSPARSE
   sub-matrices b->A and b->B, and preallocate them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) {
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* debug-only sanity check of the per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
  /* stale off-process machinery must be rebuilt after repreallocation */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* uniprocessor case has no off-diagonal block, so give it zero columns */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
439 
/* yy = A*xx.  The scatter of off-process entries of xx into a->lvec is started
   first so the communication overlaps with the diagonal-block product; the
   off-diagonal contribution is added once the scatter completes.  Statement
   order is essential here. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));      /* yy = Ad * xx, overlapped with the scatter */
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); /* yy += Ao * lvec */
  PetscFunctionReturn(0);
}
450 
/* Zero all entries of the matrix by delegating to the diagonal (A) and
   off-diagonal (B) sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(aij->A));
  PetscCall(MatZeroEntries(aij->B));
  PetscFunctionReturn(0);
}
459 
/* zz = yy + A*xx.  Same overlap pattern as MatMult_MPIAIJCUSPARSE: the ghost
   scatter is started before the diagonal-block multadd and finished before the
   off-diagonal contribution.  Statement order is essential. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));      /* zz = yy + Ad * xx */
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); /* zz += Ao * lvec */
  PetscFunctionReturn(0);
}
470 
/* yy = A^T * xx.  The off-diagonal block transpose product goes into the local
   work vector first, then its entries are accumulated into the owning ranks'
   parts of yy by a reverse-mode ADD scatter on top of the diagonal-block
   transpose product. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); /* lvec = Ao^T * xx */
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));      /* yy = Ad^T * xx */
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
481 
/* Record the cusparse storage format to use for the diagonal and/or
   off-diagonal blocks.  The format is applied to the sequential sub-matrices
   when they are (re)created.  Errors on any other operation. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG: cusparseStruct->diagGPUMatFormat = format; break;
  case MAT_CUSPARSE_MULT_OFFDIAG: cusparseStruct->offdiagGPUMatFormat = format; break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  /* fixed message: it used to list MAT_CUSPARSE_MULT_DIAG twice and name the
     nonexistent MAT_CUSPARSE_MULT_ALL */
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
498 
/* Process the -mat_cusparse_*_storage_format command-line options and forward
   chosen formats via MatCUSPARSESetFormat.  Skipped for factored matrices.
   Note the combined -mat_cusparse_storage_format option is queried last, so
   when given it overrides the two block-specific options. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
518 
/* Finish assembly via the MPIAIJ implementation, make sure the local work
   vector is CUDA, and invalidate the cached split-CSR device matrix when the
   nonzero pattern changed (its index arrays would no longer match).
   NOTE(review): the device-matrix teardown below duplicates the one in
   MatDestroy_MPIAIJCUSPARSE; a shared helper would avoid the drift risk. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate; /* state before assembly, to detect pattern changes */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    /* copy the device struct to the host so its embedded device pointers can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
548 
/* Destructor: free the cached split-CSR device matrix (if any), the COO device
   data, and the Mat_MPIAIJCUSPARSE structure, detach the composed methods, and
   finish with the base MPIAIJ destructor.
   NOTE(review): the device-matrix teardown duplicates the one in
   MatAssemblyEnd_MPIAIJCUSPARSE; a shared helper would avoid the drift risk. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    /* copy the device struct to the host so its embedded device pointers can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  /* detach every method composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
585 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the sequential blocks and
   the local vector to their CUDA flavors, attach the cusparse-specific spptr
   and method table, and compose the type-specific operations.  Supports
   MAT_INITIAL_MATRIX (duplicate), MAT_REUSE_MATRIX (copy into *newmat), and
   in-place conversion (any other reuse value leaves B itself as *newmat). */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  /* retype the existing pieces; they may not exist yet before preallocation */
  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  /* allocate the cusparse-specific side structure exactly once */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
626 
/* MatCreate_MPIAIJCUSPARSE - type constructor registered for MATMPIAIJCUSPARSE.

   Builds an ordinary MPIAIJ matrix first, then converts it in place to the
   CUSPARSE-backed type via MatConvert_MPIAIJ_MPIAIJCUSPARSE(). */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) {
  PetscFunctionBegin;
  /* ensure a CUDA device is initialized before any GPU data is created */
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}
634 
635 /*@
636    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
641    performance during matrix assembly can be increased by more than a factor of 50.
642 
643    Collective
644 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix
         (possibly different for each row) or NULL
652 
653    Output Parameter:
654 .  A - the matrix
655 
656    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
657    MatXXXXSetPreallocation() paradigm instead of this routine directly.
658    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
659 
660    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
662 
663    The AIJ format (also called the Yale sparse matrix format or
664    compressed row storage), is fully compatible with standard Fortran 77
665    storage.  That is, the stored row and column indices can begin at
666    either one (as in Fortran) or zero.  See the users' manual for details.
667 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.
672 
673    By default, this format uses inodes (identical nodes) when possible, to
674    improve numerical efficiency of matrix-vector products and solves. We
675    search for consecutive rows with the same nonzero structure, thereby
676    reusing matrix information to achieve increased efficiency.
677 
678    Level: intermediate
679 
680 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
681 @*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) {
  PetscMPIInt nproc;

  PetscFunctionBegin;
  /* the concrete type depends on the communicator size: sequential on a single
     rank, parallel otherwise, mirroring the behavior of MatCreateAIJ() */
  PetscCallMPI(MPI_Comm_size(comm, &nproc));
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  if (nproc == 1) {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(0);
}
698 
699 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
701 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
703    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
704    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
705 
706    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
707    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
708    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
709    for communicators controlling multiple processes.  It is recommended that you call both of
710    the above preallocation routines for simplicity.
711 
712    Options Database Keys:
713 +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
714 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
715 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
716 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
717 
718   Level: beginner
719 
720  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
721 M*/
722 
723 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
725 
726   Level: beginner
727 
728  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
729 M*/
730 
// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
// Returns the cached spptr->deviceMat when one exists; otherwise allocates a new
// device-resident PetscSplitCSRDataStructure populated with raw CSR pointers into
// the cusparse storage of the diagonal (and, for MPI, off-diagonal) block.
// NOTE(review): the newly built d_mat is not visibly stored back into
// spptr->deviceMat in this function — confirm the caching happens elsewhere,
// otherwise every call with a NULL deviceMat re-allocates the device struct.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) {
  PetscSplitCSRDataStructure d_mat;                                         // device-resident struct handed back to the caller
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; // 32-bit device CSR row offsets / column indices as stored by cusparse
  PetscScalar               *aa = NULL, *ba = NULL;                         // device value arrays of the diag (A) and off-diag (B) blocks
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    // factored matrices are not supported; signal with a NULL device pointer
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca: the sequential (diagonal) block and its cusparse structure
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      // an MPI matrix on a one-process communicator: use its diagonal block aij->A
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL; // no off-diagonal block in the sequential case
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    // true MPI case: need both the diagonal (aij->A) and off-diagonal (aij->B) blocks
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        // device assembly needs full (uncompressed) row offsets: build them once
        // from the host jacb->i array and cache them on the GPU
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  // same uncompressed-row-offsets treatment for the diagonal block
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      // colmap[global col] = local off-diag col + 1; entries left 0 by calloc mean "column not present"
      // NOTE(review): the (int) cast narrows to 32 bits before widening back to PetscInt —
      // presumably safe since cusparse indices are 32-bit, but confirm for n near INT_MAX
      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        // cusparse stores 32-bit indices; stage them to the host, widen to 64-bit
        // PetscInt, and upload into freshly allocated device arrays we own
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; // we own these device copies

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      // 32-bit PetscInt matches the cusparse index type: alias the existing device arrays
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    // ownership ranges for rows and columns of this rank
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; // global number of columns (colmap above is sized N+1)
#if defined(PETSC_USE_64BIT_INDICES)
    {
      // same 32-bit -> 64-bit widening for the diagonal block indices
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU; // caller is expected to write values on the device
  PetscFunctionReturn(0);
}
917