xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 5994c5adf5e28cb515faabacab9288947ca28f46)
1 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7 #include <thrust/advance.h>
8 #include <thrust/partition.h>
9 #include <thrust/sort.h>
10 #include <thrust/unique.h>
11 #include <petscsf.h>
12 
// Binary-tuple assignment functor: given a thrust tuple (src, dst), copies src into dst.
// Presumably used with thrust zip iterators elsewhere in PETSc's CUSPARSE code to copy
// one device sequence onto another — no caller is visible in this file; confirm at use site.
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20 
// Destroy the device-resident COO assembly structure attached to an MPIAIJCUSPARSE matrix.
// Registered as the PetscContainer user-destroy callback for "__PETSc_MatCOOStruct_Device",
// hence the void* signature. All array members are device allocations made with cudaMalloc
// in MatSetPreallocationCOO_MPIAIJCUSPARSE(), so they are released with cudaFree.
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  // The SF was referenced when the device struct was created, so this only drops our reference
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo)); // the struct itself is a host allocation
  PetscFunctionReturn(PETSC_SUCCESS);
}
43 
// Preallocate an MPIAIJCUSPARSE matrix from COO (coordinate) index arrays.
//
// Delegates the analysis to the host implementation MatSetPreallocationCOO_MPIAIJ(), which
// attaches a host-side MatCOOStruct_MPIAIJ to the matrix, then mirrors that structure on the
// device: every index/permutation array is cudaMalloc'ed and copied, and the device copy is
// composed on the matrix as "__PETSc_MatCOOStruct_Device" for MatSetValuesCOO_MPIAIJCUSPARSE().
//
// coo_i/coo_j may reside in host or device memory; device input is staged through a temporary
// host copy because the host preallocation routine needs host-accessible indices.
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // Tear down the off-process column machinery; preallocation below rebuilds it from scratch
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    // Indices were given on the device: copy them to host for the host-side analysis
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  // sendbuf/recvbuf are scalar staging buffers for the PetscSF exchange; allocated here, filled at MatSetValuesCOO time
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d)); // the matrix now holds the only needed reference
  PetscFunctionReturn(PETSC_SUCCESS);
}
123 
// Gather COO values bound for other MPI ranks into a contiguous send buffer:
// buf[idx] = kv[perm[idx]] for idx in [0, nnz). Grid-stride loop, so any launch
// configuration is valid.
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nnz; idx += stride) buf[idx] = kv[perm[idx]];
}
130 
// Accumulate locally-owned COO values into the diagonal block (Aa) and off-diagonal block (Ba).
//
// One logical work item per owned nonzero: indices [0, Annz) target A, [Annz, Annz+Bnnz) target B.
// Ajmap1/Bjmap1 map each nonzero to the range of repeated COO entries contributing to it and
// Aperm1/Bperm1 permute into kv[]. With INSERT_VALUES the previous matrix value is discarded,
// otherwise (ADD_VALUES) the sum is added to it.
//
// Fix: the original decremented the grid-stride loop variable (i -= Annz) in the B branch, which
// silently corrupts the next stride iteration whenever gridDim.x*blockDim.x < Annz+Bnnz; use a
// local index instead so the kernel is correct for any launch configuration.
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; // index into B; do not mutate i, which would break the grid-stride loop
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}
147 
// Accumulate COO values received from remote ranks (kv = recvbuf) into Aa/Ba.
//
// Indices [0, Annz2) target A, [Annz2, Annz2+Bnnz2) target B. Aimap2/Bimap2 give the destination
// nonzero within the block, Ajmap2/Bjmap2 the range of received entries contributing to it, and
// Aperm2/Bperm2 permute into kv[]. Remote contributions are always added (insert-vs-add was
// already resolved in the local pass).
//
// Fix: the original decremented the grid-stride loop variable (i -= Annz2) in the B branch, which
// silently corrupts the next stride iteration whenever gridDim.x*blockDim.x < Annz2+Bnnz2; use a
// local index instead so the kernel is correct for any launch configuration.
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; // index into B; do not mutate i, which would break the grid-stride loop
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}
161 
// Insert or add scalar values into an MPIAIJCUSPARSE matrix using the COO structure built by
// MatSetPreallocationCOO_MPIAIJCUSPARSE().
//
// v[] holds one scalar per (i,j) pair given at preallocation and may live in host or device
// memory (host input is copied to a temporary device buffer). Off-process entries are packed on
// the GPU, exchanged with a PetscSF reduce, and the communication is overlapped with the kernel
// that accumulates locally-owned entries.
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  // Aliases into the device-resident COO struct; all array members below are device pointers
  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError()); // catch launch-configuration errors without clearing sticky state
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); // free the temporary device copy of v
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
241 
// Merge the diagonal (Ad) and off-diagonal (Ao) sequential blocks of an MPIAIJ matrix into one
// local matrix A_loc. When glob is non-NULL, also return an IS mapping local columns of A_loc
// to global columns: the owned range first, then the off-diagonal column map.
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cstart, nd, no, k, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &nd));
    PetscCall(MatGetLocalSize(Ao, NULL, &no));
    PetscCall(MatGetOwnershipRangeColumn(A, &cstart, NULL));
    PetscCall(PetscMalloc1(nd + no, &gidx));
    /* Diagonal-block columns are the contiguous owned range; off-diagonal columns come from cmap */
    for (k = 0; k < nd; k++) gidx[k] = cstart + k;
    for (k = 0; k < no; k++) gidx[nd + k] = cmap[k];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), nd + no, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
263 
// MatMPIAIJSetPreallocation() implementation for MPIAIJCUSPARSE: (re)create the sequential
// diagonal (b->A) and off-diagonal (b->B) blocks as SEQAIJCUSPARSE matrices and preallocate them
// with d_nz/d_nnz and o_nz/o_nnz respectively. Any previous assembly state is discarded.
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) {
    // Leave hash-based MatSetValues mode: restore the saved operations table
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  // Validate user-provided per-row counts in debug builds only
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    // On a single rank there is no off-diagonal part, so B gets zero columns
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  // Propagate the storage formats chosen at the MPI level down to the sequential blocks
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
316 
// yy = A*xx for an MPIAIJCUSPARSE matrix: the ghost-value scatter is started first so the
// communication overlaps with the diagonal-block product.
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpiaij->A->ops->mult)(mpiaij->A, xx, yy)); /* diagonal block while the scatter is in flight */
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy)); /* add the off-diagonal contribution */
  PetscFunctionReturn(PETSC_SUCCESS);
}
328 
// Zero all stored entries of an MPIAIJCUSPARSE matrix by zeroing both sequential blocks.
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(mpiaij->A)); /* diagonal block */
  PetscCall(MatZeroEntries(mpiaij->B)); /* off-diagonal block */
  PetscFunctionReturn(PETSC_SUCCESS);
}
338 
// zz = yy + A*xx for an MPIAIJCUSPARSE matrix, with the ghost-value scatter overlapped
// against the diagonal-block multiply-add.
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz)); /* zz = yy + Ad*xx while communication proceeds */
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz)); /* zz += Ao*lvec */
  PetscFunctionReturn(PETSC_SUCCESS);
}
350 
// yy = A^T*xx for an MPIAIJCUSPARSE matrix: both transpose products are computed locally,
// then the off-diagonal contribution is accumulated into remote owners via a reverse scatter.
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec)); /* lvec = Ao^T*xx, to be sent to owners */
  PetscCall((*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy));           /* yy = Ad^T*xx */
  PetscCall(VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
362 
// MatCUSPARSESetFormat() implementation for MPIAIJCUSPARSE: record the requested cuSPARSE
// storage format for the diagonal block, the off-diagonal block, or both. The formats are
// applied to the sequential blocks when they are (re)created during preallocation/setup.
//
// Fix: the error message listed MAT_CUSPARSE_MULT_DIAG twice, omitted MAT_CUSPARSE_MULT_OFFDIAG,
// and named a nonexistent MAT_CUSPARSE_MULT_ALL instead of the MAT_CUSPARSE_ALL actually handled.
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
385 
// Process -mat_cusparse_* options for an MPIAIJCUSPARSE matrix: storage format of the diagonal
// block, off-diagonal block, or both. Options are only consulted for unfactored matrices.
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    // Current format values serve as the displayed defaults; set only when the option was given
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
406 
// Finish assembly via the MPIAIJ implementation, then make sure the ghost vector created by
// assembly is a CUDA vector so scatters stay on the GPU.
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (aij->lvec) PetscCall(VecSetType(aij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}
416 
// Destroy an MPIAIJCUSPARSE matrix: release the CUSPARSE-specific context, unregister the
// type-specific method implementations, then fall through to the MPIAIJ destructor.
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); /* a valid MPIAIJCUSPARSE matrix always owns one */
  PetscCallCXX(delete cusp);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
434 
435 /* defines MatSetValues_MPICUSPARSE_Hash() */
436 #define TYPE AIJ
437 #define TYPE_AIJ
438 #define SUB_TYPE_CUSPARSE
439 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
440 #undef TYPE
441 #undef TYPE_AIJ
442 #undef SUB_TYPE_CUSPARSE
443 
// MatSetUp for the hash-based MatSetValues path: run the generic MPI hash setup, then push the
// matrix's chosen cuSPARSE storage formats down to the freshly created sequential blocks.
static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(aij->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(aij->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
456 
// Convert an MPIAIJ matrix to MPIAIJCUSPARSE, honoring MAT_INITIAL_MATRIX / MAT_REUSE_MATRIX /
// MAT_INPLACE_MATRIX semantics: switch the sequential blocks and ghost vector to their CUDA
// types, install the CUSPARSE-specific context and operations table, and register the
// type-specific method implementations.
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat; // for MAT_INPLACE_MATRIX, *newmat is B itself
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); // vectors created for this matrix default to CUDA

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  // A reused matrix already carries its CUSPARSE context; otherwise allocate a fresh one
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
499 
// Registered constructor for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix, then convert it
// to the CUSPARSE type in place.
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); // ensure a CUDA device is set up before any GPU objects are created
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
508 
509 /*@
510    MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
512    to NVIDIA GPUs and use the CuSPARSE library for calculations.
513 
514    Collective
515 
516    Input Parameters:
517 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
518 .  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
519            This value should be the same as the local size used in creating the
520            y vector for the matrix-vector product y = Ax.
521 .  n - This value should be the same as the local size used in creating the
522        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
523        calculated if `N` is given) For square matrices `n` is almost always `m`.
524 .  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
525 .  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
526 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
527            (same value is used for all local rows)
528 .  d_nnz - array containing the number of nonzeros in the various rows of the
529            DIAGONAL portion of the local submatrix (possibly different for each row)
530            or `NULL`, if `d_nz` is used to specify the nonzero structure.
531            The size of this array is equal to the number of local rows, i.e `m`.
532            For matrices you plan to factor you must leave room for the diagonal entry and
533            put in the entry even if it is zero.
534 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
535            submatrix (same value is used for all local rows).
536 -  o_nnz - array containing the number of nonzeros in the various rows of the
537            OFF-DIAGONAL portion of the local submatrix (possibly different for
538            each row) or `NULL`, if `o_nz` is used to specify the nonzero
539            structure. The size of this array is equal to the number
540            of local rows, i.e `m`.
541 
542    Output Parameter:
543 .  A - the matrix
544 
545    Level: intermediate
546 
547    Notes:
548    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
549    MatXXXXSetPreallocation() paradigm instead of this routine directly.
550    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
551 
   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran
554    storage.  That is, the stored row and column indices can begin at
555    either one (as in Fortran) or zero.
556 
557 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
558 @*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt nranks;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &nranks));
  if (nranks == 1) {
    /* Single-rank communicator: the sequential CUSPARSE type suffices */
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
576 
577 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.
579 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
581    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
582    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
583 
584    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
585    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
586    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
587    for communicators controlling multiple processes.  It is recommended that you call both of
588    the above preallocation routines for simplicity.
589 
590    Options Database Keys:
591 +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
592 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
593 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
594 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
595 
596   Level: beginner
597 
598 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
599 M*/
600 
601 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
603 
604   Level: beginner
605 
606 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
607 M*/
608