xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 750b007cd8d816cecd9de99077bb0a703b4cf61a)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Thrust functor: copy the first tuple element into the second.  Used below with
   a zip iterator of (permuted source values, destination work array) to scatter
   user-provided COO values into the internal permuted layout. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20 
/* Free all device-side COO assembly state held in the matrix's spptr.

   Releases both the extended-COO device arrays (jmap/perm/imap index buffers and
   the send/recv staging buffers, allocated in MatSetPreallocationCOO_MPIAIJCUSPARSE)
   and the basic-COO thrust arrays coo_p/coo_pw.  Safe to call when no COO
   preallocation has been performed. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0); /* nothing was ever allocated */
  if (cusparseStruct->use_extended_coo) {
    /* device buffers created only by the extended (non-basic) COO path */
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* basic COO path state: permutation array and permuted-values work array */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
49 
/* Basic COO value insertion.

   v[] holds one value per COO entry in the order recorded by
   MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic: diagonal-block entries first
   (coo_nd of them), then off-diagonal entries (coo_no).  If a permutation
   coo_p was recorded during preallocation, the values are first permuted on
   the device into the work array coo_pw before being forwarded to the two
   sequential blocks.  v may live in host or device memory (detected with
   isCudaMem); v == NULL is forwarded as-is. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total number of COO entries */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* user values already on the device */
    } else {
      /* host values: stage them into a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* coo_pw[k] = d_v[coo_p[k]] via a zip of (permuted source, destination) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    /* first coo_nd permuted values feed the diagonal block, the remainder the off-diagonal block */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    /* no permutation recorded (or v == NULL): forward the values directly */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}
83 
/* Predicate for thrust::partition over (row, col, perm) tuples: true when the
   column index (second tuple element) lies inside the local diagonal-block
   column range [cstart, cend), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  /* De Morgan form of !(c < cstart || c >= cend) */
  __host__ __device__ inline bool operator()(Tuple t) { return thrust::get<1>(t) >= _cstart && thrust::get<1>(t) < _cend; }
};
91 
/* Predicate: true when the global column index c falls outside the local
   diagonal-block column range [_cstart, _cend), i.e. the entry belongs to the
   off-diagonal block. */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};
98 
/* Unary transform functor: convert a global index to a local one by subtracting
   the start of the owned range. */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};
105 
/* Basic COO preallocation: all (coo_i, coo_j) entries are assumed locally
   owned (the caller checked this).  The entries are partitioned on the device
   into diagonal-block entries followed by off-diagonal entries; the applied
   permutation (if any) is saved in cusp->coo_p so MatSetValuesCOO can permute
   the user's values the same way.  Off-diagonal column indices are compacted
   into b->garray, and the two sequential AIJCUSPARSE blocks b->A and b->B are
   (re)created and COO-preallocated. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;                 /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  /* locate the first off-diagonal entry, then check whether any diagonal entry follows it;
     if so the input is not already partitioned and we must partition (and remember the permutation) */
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n); /* permutation: original position of each partitioned entry */
    cusp->coo_pw = new THRUSTARRAY(n);    /* work array for permuted values in MatSetValuesCOO */
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    /* stable-partition rows/cols/permutation together: diagonal entries first */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd); /* #entries in diagonal block */
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());   /* #entries in off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* compact jj[]: map each global off-diagonal column to its index in garray */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  /* (re)create the sequential diagonal and off-diagonal blocks */
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff)); /* off-diag block has compacted width noff */
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
182 
/* COO preallocation entry point.

   Chooses between the "basic" path (every rank's entries are non-negative and
   locally row-owned) and the "extended" path (MatSetPreallocationCOO_MPIAIJ,
   which supports negative indices and remote entries).  All ranks agree on the
   choice via an allreduce.  On the extended path, the host-side assembly maps
   built by MatSetPreallocationCOO_MPIAIJ (jmap/perm/imap arrays and send/recv
   lengths) are mirrored into device memory for use by the kernels below. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* discard any previous assembly/scatter state before re-preallocating */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    /* NOTE(review): indices in device memory are not scanned here, so a device
       array containing remote/negative entries would still take the basic path */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host assembly maps; freed in MatResetPreallocationCOO_MPIAIJCUSPARSE */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    /* copy the host-side maps to the device */
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
264 
/* Gather kernel: buf[i] = kv[perm[i]] for i in [0, nnz).  Packs the COO values
   destined for other ranks into a contiguous send buffer.  Launched with a 1-D
   grid; the grid-stride loop keeps it correct for any launch configuration. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) {
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nnz; idx += stride) buf[idx] = kv[perm[idx]];
}
270 
271 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) {
272   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
273   const PetscCount grid_size = gridDim.x * blockDim.x;
274   for (; i < Annz + Bnnz; i += grid_size) {
275     PetscScalar sum = 0.0;
276     if (i < Annz) {
277       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
278       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
279     } else {
280       i -= Annz;
281       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
282       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
283     }
284   }
285 }
286 
287 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) {
288   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
289   const PetscCount grid_size = gridDim.x * blockDim.x;
290   for (; i < Annz2 + Bnnz2; i += grid_size) {
291     if (i < Annz2) {
292       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
293     } else {
294       i -= Annz2;
295       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
296     }
297   }
298 }
299 
/* Set/add COO values on the device.

   On the extended path: stage host values to the device if needed, pack the
   remotely-owned entries into sendbuf, start the PetscSF reduction that ships
   them to their owners, overlap that communication with the local-value kernel,
   then apply the received values.  On the basic path, defer to
   MatSetValuesCOO_MPIAIJCUSPARSE_Basic.  Marks the matrix offloaded to GPU. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v; /* device-resident view of the user's values */
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
368 
/* Merge the diagonal and off-diagonal sequential blocks of A into a single
   local matrix A_loc.  If glob is non-NULL, also build the index set of global
   column indices corresponding to A_loc's columns: the contiguous owned range
   first, then the off-diagonal columns from the cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) {
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    /* owned columns are cst..cst+dn-1; off-diagonal columns come from cmap */
    for (PetscInt k = 0; k < dn + on; k++) gidx[k] = (k < dn) ? cst + k : cmap[k - dn];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}
389 
/* Classic (non-COO) preallocation for MPIAIJCUSPARSE.

   Validates the per-row nonzero counts in debug builds, tears down any stale
   column map / scatter state, (re)creates the sequential AIJCUSPARSE blocks
   b->A and b->B, preallocates them, and applies the stored cuSPARSE storage
   formats.  On a single rank the off-diagonal block gets zero columns. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) {
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* off-diagonal block spans all global columns when there is more than one rank */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
435 
/* yy = A * xx.  The halo scatter of xx into lvec is started first so the
   diagonal-block multiply can overlap the communication; the off-diagonal
   contribution is then added after the scatter completes. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); /* start fetching ghost values */
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));                                       /* yy = Ad*xx, overlapped with comm */
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));                           /* yy += Ao*lvec */
  PetscFunctionReturn(0);
}
446 
/* Zero every stored entry of the parallel matrix by zeroing both of its
   sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(aij->A)); /* diagonal block */
  PetscCall(MatZeroEntries(aij->B)); /* off-diagonal block */
  PetscFunctionReturn(0);
}
455 
/* zz = yy + A * xx, with the halo scatter overlapped by the diagonal-block
   multiply-add, mirroring MatMult_MPIAIJCUSPARSE. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); /* start fetching ghost values */
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));                                /* zz = yy + Ad*xx, overlapped with comm */
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));                           /* zz += Ao*lvec */
  PetscFunctionReturn(0);
}
466 
/* yy = A^T * xx.  The off-diagonal transpose product lands in the local ghost
   vector and is then reverse-scattered (with addition) into yy, on top of the
   diagonal-block transpose product. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); /* lvec = Ao^T*xx */
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));      /* yy = Ad^T*xx */
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); /* yy += scattered lvec */
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
477 
/* Set the cuSPARSE storage format used for SpMV on the diagonal block, the
   off-diagonal block, or both.

   Input Parameters:
+  A      - the MATMPIAIJCUSPARSE matrix
.  op     - which block(s) the format applies to
-  format - the MatCUSPARSEStorageFormat to use

   Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG and MAT_CUSPARSE_ALL
   are handled; any other operation raises PETSC_ERR_SUP. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG: cusparseStruct->diagGPUMatFormat = format; break;
  case MAT_CUSPARSE_MULT_OFFDIAG: cusparseStruct->offdiagGPUMatFormat = format; break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  /* message fixed: it previously listed MAT_CUSPARSE_MULT_DIAG twice and the
     nonexistent MAT_CUSPARSE_MULT_ALL instead of the enums this switch handles */
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
494 
/* Process -mat_cusparse_* command-line options.

   Exposes the diagonal, off-diagonal, and combined storage-format options and
   forwards any set value to MatCUSPARSESetFormat.  Skipped entirely for
   factored matrices (A->factortype != MAT_FACTOR_NONE). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
514 
/* Finish assembly via the MPIAIJ implementation, force the ghost vector onto
   the GPU, and invalidate the cached split-CSR device matrix if the nonzero
   pattern changed (the device copy's index arrays would be stale).

   Freeing walks a host mirror of the device struct because the member pointers
   (colmap, diag/offdiag index arrays) live inside device memory. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate; /* snapshot to detect pattern changes */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    /* pull the struct to the host so we can read its device pointers */
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
544 
/* Destroy the MPIAIJCUSPARSE matrix: free the split-CSR device matrix (via a
   host mirror, same pattern as MatAssemblyEnd_MPIAIJCUSPARSE), the COO device
   state, the spptr struct, the composed type-specific functions, and finally
   defer to MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    /* pull the struct to the host so its device pointers can be freed */
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
581 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE.

   Handles all three MatReuse modes (for MAT_INPLACE_MATRIX, *newmat aliases B),
   switches the default vector type and the sequential blocks to their CUDA
   types, allocates the Mat_MPIAIJCUSPARSE spptr when needed, and installs the
   CUSPARSE-specific ops table and composed functions. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  /* retype the existing sequential blocks and ghost vector for the GPU */
  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
622 
/* Type constructor registered for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) {
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* ensure a CUDA device exists before any GPU state is created */
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}
630 
631 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz, o_nz (or the arrays d_nnz, o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.
638 
639    Collective
640 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE`)
.  n - number of local columns (or `PETSC_DECIDE`)
.  M - number of global rows (or `PETSC_DETERMINE`)
.  N - number of global columns (or `PETSC_DETERMINE`)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or NULL
648 
649    Output Parameter:
650 .  A - the matrix
651 
652    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
653    MatXXXXSetPreallocation() paradigm instead of this routine directly.
654    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
655 
656    Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
658 
   The AIJ format (also called compressed row storage) is fully compatible
   with standard Fortran 77 storage.  That is, the stored row and column
   indices can begin at either one (as in Fortran) or zero.  See the users'
   manual for details.
663 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control
   dynamic memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
668 
669    By default, this format uses inodes (identical nodes) when possible, to
670    improve numerical efficiency of matrix-vector products and solves. We
671    search for consecutive rows with the same nonzero structure, thereby
672    reusing matrix information to achieve increased efficiency.
673 
674    Level: intermediate
675 
676 .seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
677 @*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) {
  PetscMPIInt nproc; /* number of ranks in comm decides seq vs MPI type */

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &nproc));
  if (nproc == 1) {
    /* single rank: sequential CUSPARSE matrix, only the "diagonal" preallocation applies */
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    /* parallel: MPI CUSPARSE matrix with separate diagonal/off-diagonal preallocation */
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(0);
}
694 
695 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.
697 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
699    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
700    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
701 
702    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
703    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
704    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
705    for communicators controlling multiple processes.  It is recommended that you call both of
706    the above preallocation routines for simplicity.
707 
708    Options Database Keys:
709 +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
710 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
711 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
712 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
713 
714   Level: beginner
715 
716  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
717 M*/
718 
719 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
721 
722   Level: beginner
723 
724  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
725 M*/
726 
// Get GPU pointers to a stripped-down split-CSR view of A (diagonal block, and in
// parallel also the off-diagonal block plus a global->local column map), packaged
// as a PetscSplitCSRDataStructure resident in device memory. Works for both seq
// and MPI Mat.
//
// On first use (cached deviceMat pointer is NULL) the struct is populated on the
// host and cudaMemcpy'ed to the device; otherwise the cached pointer is returned.
// For factored matrices *B is set to NULL. On success A->offloadmask is set to
// PETSC_OFFLOAD_GPU, i.e. the caller is expected to write matrix values on the
// device.
//
// NOTE(review): d_mat is read from spptr->deviceMat / cusparsestructA->deviceMat,
// but the freshly cudaMalloc'ed pointer is never stored back into those fields in
// this function -- confirm the cache is populated elsewhere, otherwise the
// "Reusing device matrix" branch can never trigger from here.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) {
  PetscSplitCSRDataStructure d_mat;                                         // device-resident struct (cached per matrix)
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; // device 32-bit CSR row offsets / column indices (a* = diag block, b* = off-diag block)
  PetscScalar               *aa = NULL, *ba = NULL;                         // device CSR values (diag / off-diag)
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;                     // host-side AIJ data of the blocks
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    // factored matrices carry no plain CSR data to expose
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca (host AIJ data of the diagonal block) and make sure its data is on the GPU
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      // MPIAIJ on a one-rank communicator: use its diagonal block aij->A
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL; // no off-diagonal block in the sequential case
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        // B is stored with compressed rows on the GPU; build (once, then cached) a
        // full-size row-offset array on the device from the host jacb->i
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  // same full-size row-offset treatment for the diagonal block
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      // colmap maps a global column to (local off-diagonal column + 1); the
      // calloc'ed 0 marks columns absent from the off-diagonal block.
      // NOTE(review): the stored value is cast to int while colmap is PetscInt --
      // assumes the local off-diagonal column count fits in int; confirm.
      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these: the device CSR arrays are 32-bit,
        // so stage them on the host, widen to 64-bit PetscInt, and upload copies
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; // flag: index arrays were allocated here

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      // 32-bit PetscInt: the device CSR index arrays can be aliased directly
      h_mat->offdiag.i         = (PetscInt *)bi;
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n; // number of local rows

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart; // PetscLayout reuses the rstart/rend field names for the column ownership range
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; // note: this field holds the global number of columns (A->cmap->N)
#if defined(PETSC_USE_64BIT_INDICES)
    { // same 32-bit -> 64-bit index widening for the diagonal block
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU; // caller is expected to write the values on the device
  PetscFunctionReturn(0);
}
913