xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1 #define PETSC_SKIP_SPINLOCK
2 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3 
4 #include <petscconf.h>
5 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
6 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8 #include <thrust/advance.h>
9 #include <thrust/partition.h>
10 #include <thrust/sort.h>
11 #include <thrust/unique.h>
12 #include <petscsf.h>
13 
/* Thrust functor: given a tuple (src, dst), copy src into dst.
   Used with a zip iterator to scatter permuted COO values into a work array. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* tuple element 0 is the source value, element 1 the destination slot */
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
21 
/* Free all device-side COO assembly data attached to the matrix.

   The extended-COO device arrays (jmap/perm/imap mirrors and the send/recv
   buffers) exist only when use_extended_coo was set by
   MatSetPreallocationCOO_MPIAIJCUSPARSE; the coo_p/coo_pw thrust arrays belong
   to the basic COO path.  Safe to call when no COO data was ever set up
   (spptr may be NULL). */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0); /* nothing was ever allocated */
  if (cusparseStruct->use_extended_coo) {
    /* device mirrors of the host-side COO maps built in the extended path */
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* basic COO path: permutation and permuted-values work arrays (may be NULL) */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
51 
/* Set COO values on the diagonal (a->A) and off-diagonal (a->B) blocks.

   v[] is laid out as [coo_nd diagonal entries | coo_no off-diagonal entries],
   the ordering produced by MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic.  If
   that setup had to reorder the user's entries, the recorded permutation
   coo_p is applied to v on the device first (into the work array coo_pw).
   v may be host- or device-resident; a NULL v is forwarded unchanged to the
   sequential routines. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no; /* total # of COO entries */

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage a temporary device copy */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* scatter the permuted values into coo_pw: coo_pw[k] = v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    /* entries were already ordered diagonal-first (or v is NULL) */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}
86 
/* Thrust predicate on a (row, col, ...) tuple: true when the column index
   (tuple element 1) lies inside the local diagonal block [cstart, cend). */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend;
  }
};
94 
/* Thrust predicate: true when column index c falls outside the local
   diagonal block [cstart, cend), i.e. it belongs to the off-diagonal part. */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c)
  {
    return !(c >= _cstart && c < _cend);
  }
};
101 
/* Thrust unary op: translate a global index into a process-local one by
   subtracting the start of the owned range. */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};
108 
/* Build the diagonal (b->A) and off-diagonal (b->B) SEQAIJCUSPARSE blocks from
   n COO entries with global indices coo_i[]/coo_j[].  This "basic" path is
   used when every entry belongs to the local row range (checked by the caller).

   The entries are partitioned on the device so that those whose column lies in
   the local column range [B->cmap->rstart, B->cmap->rend) — the diagonal
   block — come first; the permutation applied is recorded in cusp->coo_p so
   MatSetValuesCOO can reorder user value arrays the same way.  Off-diagonal
   column indices are compacted (sorted + uniqued) into b->garray and
   translated to local indices for b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  /* Only partition (and record the permutation) when a diagonal entry follows
     an off-diagonal one, i.e. the input is not already "diagonal first" */
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    /* partition (i, j, perm) triples together so coo_p tracks the reordering */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit); /* # distinct off-diagonal columns */
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* translate the (global) off-diagonal columns kept in jj[] into indices into garray */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
186 
/* Preallocate the matrix from coo_n COO triplets (coo_i[], coo_j[]).

   Any previous assembly/COO state is torn down first.  If on every rank all
   entries are local (row inside the owned range, no negative indices), the
   fast "basic" device path is taken.  Otherwise the generic host-side
   MatSetPreallocationCOO_MPIAIJ computes the assembly maps, and those maps
   (jmap/perm arrays for local entries, imap/jmap/perm arrays for received
   remote entries, plus the send permutation and send/recv buffers) are
   mirrored into device memory for MatSetValuesCOO_MPIAIJCUSPARSE. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* invalidate cached communication/assembly structures of the old pattern */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    /* device-resident indices are not scanned: they are treated as basic */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host-side COO assembly maps ... */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    /* ... and copy their contents over (recvbuf_d is filled at MatSetValuesCOO time) */
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
269 
/* Gather kernel: buf[i] = kv[perm[i]] for i in [0, nnz).
   Launched 1-D; the grid-stride loop keeps it correct for any grid size. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nnz; idx += stride) buf[idx] = kv[perm[idx]];
}
276 
/* Add the locally-owned COO values kv[] into the diagonal-block values Aa[]
   and off-diagonal-block values Ba[].

   Ajmap1[i]..Ajmap1[i+1] (resp. Bjmap1) delimits, within Aperm1[] (resp.
   Bperm1[]), the positions in kv[] of the repeated COO entries mapping to
   nonzero i of A (resp. B).  With INSERT_VALUES the previous matrix value is
   discarded, otherwise the sum is added to it.

   Launched 1-D over Annz + Bnnz work items.  Fix: the original mutated the
   grid-stride loop variable i (i -= Annz) inside the body, which corrupts
   subsequent iterations whenever grid_size < Annz + Bnnz; use a local index
   for the B part instead so the kernel is correct for any launch config. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      /* work item i targets nonzero i of the diagonal block A */
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      /* work items [Annz, Annz+Bnnz) target the off-diagonal block B;
         do NOT modify i: it is the grid-stride loop variable */
      const PetscCount j = i - Annz;
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}
293 
/* Accumulate COO values received from other ranks (kv[] = recv buffer) into
   the diagonal (Aa) and off-diagonal (Ba) blocks.

   For the i-th remote-touched nonzero of A, Aimap2[i] is its position in Aa
   and Ajmap2[i]..Ajmap2[i+1] delimits in Aperm2[] the kv[] positions to sum
   (likewise Bimap2/Bjmap2/Bperm2 for B).  Remote values are always added,
   regardless of the insert mode handled by MatAddLocalCOOValues.

   Launched 1-D over Annz2 + Bnnz2 work items.  Fix: the original mutated the
   grid-stride loop variable i (i -= Annz2) inside the body, which corrupts
   subsequent iterations whenever grid_size < Annz2 + Bnnz2; use a local index
   for the B part instead so the kernel is correct for any launch config. */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      /* do NOT modify i: it is the grid-stride loop variable */
      const PetscCount j = i - Annz2;
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}
307 
/* Insert (INSERT_VALUES) or add (ADD_VALUES) the COO values v[] set up by
   MatSetPreallocationCOO_MPIAIJCUSPARSE.

   Extended path: v is staged to the device if it resides on the host; entries
   destined for other ranks are packed through Cperm1 and reduced via the
   coo_sf star forest, overlapped with the kernel that sums local entries into
   the A and B blocks; received remote entries are then accumulated.  Basic
   path: forwarded to MatSetValuesCOO_MPIAIJCUSPARSE_Basic.  Either way the
   matrix is marked up to date on the GPU. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v; /* device-resident view of v */
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    /* free the temporary device copy made above, if any */
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
377 
/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   A_loc; optionally return in glob the global column index of each local
   column of the merged matrix. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    /* diagonal-block columns are contiguous, starting at the owned column range */
    for (PetscInt i = 0; i < dn; i++) gidx[i] = cst + i;
    /* off-diagonal columns come from the compacted global column map */
    for (PetscInt i = 0; i < on; i++) gidx[dn + i] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}
399 
/* Classic (d_nz/d_nnz, o_nz/o_nnz) preallocation: (re)create the diagonal
   block b->A and off-diagonal block b->B as SEQAIJCUSPARSE matrices, pass the
   preallocation down to them, and discard cached scatter/colmap data from any
   previous nonzero pattern. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* validate user-provided per-row nonzero counts in debug builds only */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* uniprocessor runs have no off-diagonal part */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
446 
/* yy = A*xx: overlap the halo scatter of xx with the diagonal-block product,
   then add the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a  = (Mat_MPIAIJ *)A->data;
  Mat         Ad = a->A, Ao = a->B;

  PetscFunctionBegin;
  /* start communicating off-process entries of xx into the local work vector */
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* diagonal block: yy = Ad*xx, overlapped with the scatter */
  PetscCall((*Ad->ops->mult)(Ad, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* off-diagonal block: yy += Ao*lvec */
  PetscCall((*Ao->ops->multadd)(Ao, a->lvec, yy, yy));
  PetscFunctionReturn(0);
}
458 
/* Zero every stored value of the matrix by zeroing both sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(aij->A));
  PetscCall(MatZeroEntries(aij->B));
  PetscFunctionReturn(0);
}
468 
/* zz = yy + A*xx: overlap the halo scatter of xx with the diagonal-block
   multiply-add, then accumulate the off-diagonal contribution into zz. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a  = (Mat_MPIAIJ *)A->data;
  Mat         Ad = a->A, Ao = a->B;

  PetscFunctionBegin;
  /* start communicating off-process entries of xx into the local work vector */
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* diagonal block: zz = yy + Ad*xx, overlapped with the scatter */
  PetscCall((*Ad->ops->multadd)(Ad, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* off-diagonal block: zz += Ao*lvec */
  PetscCall((*Ao->ops->multadd)(Ao, a->lvec, zz, zz));
  PetscFunctionReturn(0);
}
480 
/* yy = A^T * xx: apply the transposes of both blocks locally, then
   reverse-scatter (with addition) the off-diagonal contribution to its
   owning ranks. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a  = (Mat_MPIAIJ *)A->data;
  Mat         Ad = a->A, Ao = a->B;

  PetscFunctionBegin;
  /* off-diagonal transpose product lands in the local work vector ... */
  PetscCall((*Ao->ops->multtranspose)(Ao, xx, a->lvec));
  PetscCall((*Ad->ops->multtranspose)(Ad, xx, yy));
  /* ... and is summed into yy on the ranks that own those rows */
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
492 
/* Record the cuSPARSE storage format to use for SpMV on the diagonal and/or
   off-diagonal blocks; the stored formats are pushed onto the blocks when
   they are (re)created during preallocation/COO setup. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the enums handled above */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
515 
/* Process the -mat_cusparse_* options that choose the cuSPARSE storage format
   used for SpMV on the diagonal and off-diagonal blocks.  Only applies to
   unfactored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    /* this one sets both blocks at once */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
536 
/* Complete MPIAIJ assembly, switch the halo vector to a CUDA vector, and, if
   the nonzero pattern changed, free the cached device-side split-CSR matrix
   (deviceMat) since it no longer matches.  The teardown here mirrors the one
   in MatDestroy_MPIAIJCUSPARSE. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate; /* snapshot before assembly */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    /* the structure lives on the device; fetch a host copy to reach its pointers */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
567 
/* Destroy the CUSPARSE-specific parts of the matrix — the cached device-side
   split-CSR matrix, the device COO data, and spptr itself — unregister the
   composed methods, then run the base MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    /* the structure lives on the device; fetch a host copy to reach its pointers */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
605 
/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE (initial copy, reuse, or in
   place): switch the sub-matrices and halo vector to their CUDA types,
   install the CUSPARSE method table, and compose the type-specific
   functions. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat; /* with MAT_INPLACE_MATRIX this is B itself */
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  /* retype the existing sub-objects, if any */
  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
647 
/* Constructor registered for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* CUDA must be up before the conversion touches device state */
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}
656 
657 /*@
658    MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.
664 
665    Collective
666 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
.  n - number of local columns (or `PETSC_DECIDE` to have calculated if N is given)
.  M - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix
           (possibly different for each row) or NULL
674 
675    Output Parameter:
676 .  A - the matrix
677 
678    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
679    MatXXXXSetPreallocation() paradigm instead of this routine directly.
680    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
681 
682    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
684 
   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran 77
687    storage.  That is, the stored row and column indices can begin at
688    either one (as in Fortran) or zero.  See the users' manual for details.
689 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
692    allocation.  For large problems you MUST preallocate memory or you
693    will get TERRIBLE performance, see the users' manual chapter on matrices.
694 
695    By default, this format uses inodes (identical nodes) when possible, to
696    improve numerical efficiency of matrix-vector products and solves. We
697    search for consecutive rows with the same nonzero structure, thereby
698    reusing matrix information to achieve increased efficiency.
699 
700    Level: intermediate
701 
702 .seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
703 @*/
/* Create an AIJCUSPARSE matrix: choose the sequential or parallel flavor from
   the communicator size, then apply the caller-supplied preallocation. */
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &nproc));
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  if (nproc == 1) {
    /* single-rank communicator: sequential storage; only the "diagonal" preallocation applies */
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(0);
}
721 
722 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
726    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
727    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
728 
729    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
730    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
731    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
732    for communicators controlling multiple processes.  It is recommended that you call both of
733    the above preallocation routines for simplicity.
734 
735    Options Database Keys:
736 +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
737 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
738 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
739 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
740 
741   Level: beginner
742 
743  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
744 M*/
745 
746 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
748 
749   Level: beginner
750 
751  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
752 M*/
753 
754 // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
/* Build (or reuse) a device-resident PetscSplitCSRDataStructure exposing the raw CSR
   arrays (row offsets, column indices, values) of A's diagonal and off-diagonal blocks,
   so user device kernels can assemble/modify values directly on the GPU.

   Input Parameter:
.  A - an assembled, unfactored MATSEQAIJCUSPARSE or MATMPIAIJCUSPARSE matrix

   Output Parameter:
.  B - device pointer to the split-CSR descriptor; NULL if A is factored

   Notes:
   The descriptor is cached inside the matrix (deviceMat) and reused on later calls.
   On return the offload mask is set to PETSC_OFFLOAD_GPU since the caller is expected
   to write the device-side values. Only MAT_CUSPARSE_CSR storage is supported. */
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; /* device CSR index arrays (32-bit, as stored by cuSPARSE); a* = diag block, b* = off-diag block */
  PetscScalar               *aa = NULL, *ba = NULL;                         /* device CSR value arrays */
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;                     /* host-side AIJ data of the diag/off-diag blocks */
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices are not supported; signal this with a NULL descriptor */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      /* MPIAIJ matrix living on a one-process communicator: use its sequential diagonal block */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL; /* no off-diagonal block in the sequential case */
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
    PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
      PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix *)matstructA->mat;
      matrixB = (CsrMatrix *)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* B is stored with compressed rows on the GPU; build (once, then cached) a
           full-length row-offset array on the device from the host CSR row pointers */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same full-length row-offset treatment for the diagonal block */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host, then copy it to the device
    PetscCall(PetscInfo(A, "Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscInt   *colmap;
      PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;

      PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps a global column index to the 1-based local column of B; 0 means "column not present" */
      PetscCall(PetscCalloc1(N + 1, &colmap));
      for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        /* cuSPARSE stores indices as 32-bit ints: round-trip B's index arrays through
           the host to widen them to 64-bit PetscInt copies owned by the descriptor */
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
        for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));

        h_mat->offdiag.i         = d_bi64;
        h_mat->offdiag.j         = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* widened copies are owned by the descriptor and must be freed with it */

        PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
      }
#else
      h_mat->offdiag.i         = (PetscInt *)bi; /* 32-bit-index build: borrow cuSPARSE's arrays directly */
      h_mat->offdiag.j         = (PetscInt *)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    /* ownership ranges and global column count for global-to-local index translation in kernels */
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; /* NOTE(review): the field named M holds the global number of COLUMNS here */
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* same 32-bit -> 64-bit widening for the diagonal block's index arrays */
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8, "");
      PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
      for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));

      h_mat->diag.i            = d_ai64;
      h_mat->diag.j            = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
    }
#else
    h_mat->diag.i            = (PetscInt *)ai;
    h_mat->diag.j            = (PetscInt *)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A, "Reusing device matrix\n"));
  }
  *B             = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller is handed write access to the device values */
  PetscFunctionReturn(0);
}
941