xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 6adde7963cff47966dd96ba87e71efa6228982fa) !
1 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7 #include <thrust/advance.h>
8 #include <thrust/partition.h>
9 #include <thrust/sort.h>
10 #include <thrust/unique.h>
11 #include <petscsf.h>
12 
// Thrust functor: given a tuple of two element references, assigns element <0>
// to element <1> (i.e. copies the first tuple component into the second).
// Not referenced in this file's visible code; presumably applied via thrust
// algorithms over zip iterators elsewhere — confirm at call sites.
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20 
// Destructor callback for the device-side COO assembly struct attached to the
// matrix via a PetscContainer: releases the SF and every cudaMalloc'ed array,
// then frees the struct itself.
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *p = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  // Drop our reference to the star forest used for remote COO communication
  PetscCall(PetscSFDestroy(&p->sf));
  // Free device-side index maps and permutations for the local (pass 1) entries
  PetscCallCUDA(cudaFree(p->Ajmap1));
  PetscCallCUDA(cudaFree(p->Aperm1));
  PetscCallCUDA(cudaFree(p->Bjmap1));
  PetscCallCUDA(cudaFree(p->Bperm1));
  // Free device-side maps for the received remote (pass 2) entries
  PetscCallCUDA(cudaFree(p->Aimap2));
  PetscCallCUDA(cudaFree(p->Ajmap2));
  PetscCallCUDA(cudaFree(p->Aperm2));
  PetscCallCUDA(cudaFree(p->Bimap2));
  PetscCallCUDA(cudaFree(p->Bjmap2));
  PetscCallCUDA(cudaFree(p->Bperm2));
  // Free the pack permutation and the send/receive value buffers
  PetscCallCUDA(cudaFree(p->Cperm1));
  PetscCallCUDA(cudaFree(p->sendbuf));
  PetscCallCUDA(cudaFree(p->recvbuf));
  PetscCall(PetscFree(p));
  PetscFunctionReturn(PETSC_SUCCESS);
}
43 
// Preallocate the matrix from COO (i,j) index arrays for GPU assembly.
// Delegates the analysis to the host MPIAIJ implementation (copying the indices
// to the host first if the caller supplied device arrays), then mirrors the
// resulting host COO struct into device memory and attaches it to the matrix
// so MatSetValuesCOO can run entirely on the GPU.
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE; // were coo_i/coo_j given in device memory?
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;  // host copies (or aliases) of the index arrays
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // Reset any existing parallel scaffolding; the COO analysis rebuilds it
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  // MatSetPreallocationCOO_MPIAIJ works on host arrays, so stage device input on the host
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  // Host-side analysis: builds the COO struct and composes it on the matrix
  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  // Allocate device mirrors of every host-side index/permutation array plus the
  // scalar send/receive buffers (sizes come straight from the host struct)
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  // Upload the host index data; the send/recv buffers are scratch and stay uninitialized
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d)); // the matrix now holds the only reference
  PetscFunctionReturn(PETSC_SUCCESS);
}
123 
// Pack matrix values destined for other ranks into a contiguous send buffer:
// buf[i] = kv[perm[i]] for i in [0, nnz). Launched on a 1D grid; the
// grid-stride loop makes it correct for any launch configuration.
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  // Cast before multiplying: blockIdx.x/blockDim.x/gridDim.x are 32-bit unsigned,
  // so their product can wrap before being widened to PetscCount on huge launches
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}
130 
// Accumulate locally-contributed COO values into the diagonal (A) and
// off-diagonal (B) blocks. Thread index i covers A's nonzeros first
// ([0, Annz)), then B's ([Annz, Annz+Bnnz)). For each nonzero, jmap gives the
// [start, end) range in the permutation array of the repeated COO entries that
// sum into it. INSERT_VALUES overwrites; otherwise the sum is added.
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  // 64-bit index math: cast before multiplying so the 32-bit built-ins cannot wrap
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz; // note: i is restored by the loop increment only after re-adding would be wrong; grid-stride uses the shifted value plus grid_size, matching the original behavior
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}
147 
// Accumulate COO values received from remote ranks into A and B. Unlike the
// local kernel this always adds (remote contributions are summed on top of the
// values the local kernel already wrote); imap maps the compacted remote
// nonzero index to its location in the block's value array.
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  // 64-bit index math: cast before multiplying so the 32-bit built-ins cannot wrap
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}
161 
// Set/add matrix values from a COO value array v[] (ordered as in the
// preallocation call), entirely on the GPU. Remote entries are packed and sent
// via the SF while the local entries are accumulated, overlapping
// communication with computation.
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v; // device pointer to the input values (may be a staged copy)
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  // Retrieve the device COO struct built by MatSetPreallocationCOO_MPIAIJCUSPARSE
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  // Aliases for the device-resident maps/buffers in the COO struct
  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError()); // catch launch-configuration errors
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  // Restore routines must mirror the Get* variant used above
  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); // free the staged copy
  mat->offloadmask = PETSC_OFFLOAD_GPU; // device now holds the authoritative values
  PetscFunctionReturn(PETSC_SUCCESS);
}
241 
// Merge the diagonal and off-diagonal sequential blocks into one local matrix
// and, if requested, return the IS mapping the merged local columns to global
// column indices.
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt  dn, on, cst;
    PetscInt *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    // Diagonal-block columns are the contiguous owned range [cst, cst + dn)
    for (PetscInt k = 0; k < dn; k++) gidx[k] = cst + k;
    // Off-diagonal columns translate through the compressed column map
    for (PetscInt k = 0; k < on; k++) gidx[dn + k] = cmap[k];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
263 
// Classic (nnz-per-row) preallocation for the MPIAIJCUSPARSE type: tears down
// stale parallel data, (re)creates the diagonal block b->A and off-diagonal
// block b->B as SEQAIJCUSPARSE matrices, preallocates them, and applies the
// storage formats recorded in the Mat_MPIAIJCUSPARSE struct.
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  // Debug-only validation: per-row nonzero counts must be non-negative
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
  // Discard parallel scaffolding that depends on the old nonzero structure
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    // On one rank there is no off-diagonal part, so B gets zero columns
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  // Propagate the parent's CPU-binding state to the blocks
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
310 
// y = A*x for the parallel matrix: overlap the scatter of off-process x
// entries with the diagonal-block multiply, then add the off-diagonal part.
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  // Begin gathering the ghost entries of xx into lvec...
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  // ...while the diagonal block computes yy = A_diag * xx
  PetscCall((*mpiaij->A->ops->mult)(mpiaij->A, xx, yy));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  // yy += A_offdiag * lvec
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}
322 
// Zero all stored entries by zeroing the two sequential blocks.
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(aij->A)); // diagonal block
  PetscCall(MatZeroEntries(aij->B)); // off-diagonal block
  PetscFunctionReturn(PETSC_SUCCESS);
}
332 
// z = y + A*x: same communication/computation overlap as MatMult, but the
// diagonal block accumulates into zz starting from yy.
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  // Start the ghost-value scatter of xx into lvec
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  // zz = yy + A_diag * xx while the scatter is in flight
  PetscCall((*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  // zz += A_offdiag * lvec
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
344 
// y = A^T * x: local transpose multiplies first, then a reverse scatter
// accumulates the off-diagonal contributions into the owning ranks' yy.
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  // lvec = A_offdiag^T * xx (contributions destined for other ranks)
  PetscCall((*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec));
  // yy = A_diag^T * xx (purely local part)
  PetscCall((*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy));
  // Reverse scatter adds lvec into the remote owners' yy entries
  PetscCall(VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
356 
// Record the requested cuSPARSE storage format for the diagonal block, the
// off-diagonal block, or both; the formats are applied to the sequential
// blocks at preallocation/setup time.
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    // Fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and named a
    // nonexistent MAT_CUSPARSE_MULT_ALL instead of the supported enumerators
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
379 
// Process -mat_cusparse_* options controlling the storage format of the
// diagonal/off-diagonal blocks. Only consulted for unfactored matrices.
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    // Each option defaults to the currently stored format; apply only if the user set it
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    // Sets both blocks at once
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
400 
// Finish assembly via the host MPIAIJ path, then ensure the ghost-value
// vector is a CUDA vector so subsequent scatters stay on the device.
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  // lvec may not exist yet (e.g. flush assembly); only retype it when present
  if (aij->lvec) PetscCall(VecSetType(aij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}
410 
// Destroy the CUSPARSE-specific state, unhook the composed methods, and fall
// through to the base MPIAIJ destructor.
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusp);
  // Clear the method registrations installed by MatConvert_MPIAIJ_MPIAIJCUSPARSE
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
428 
429 /* defines MatSetValues_MPICUSPARSE_Hash() */
430 #define TYPE AIJ
431 #define TYPE_AIJ
432 #define SUB_TYPE_CUSPARSE
433 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
434 #undef TYPE
435 #undef TYPE_AIJ
436 #undef SUB_TYPE_CUSPARSE
437 
// Set up the hash-based assembly path, then push the stored cuSPARSE storage
// formats down to the freshly-created sequential blocks.
static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(mpiaij->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(mpiaij->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
450 
// Convert an MPIAIJ matrix to the MPIAIJCUSPARSE subtype: retype the blocks
// and local vector to their CUDA variants, install the CUSPARSE op table, and
// compose the type-specific method implementations.
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  // MAT_INPLACE_MATRIX falls through with *newmat == B; otherwise duplicate/copy
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  // CUDA vectors become the default for vectors created from this matrix
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  // Retype whatever sub-objects already exist; they may not yet on a fresh matrix
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  // Allocate the CUSPARSE-specific struct unless a reused matrix already has one
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  // Register the type-specific implementations queried via PetscObjectQueryFunction
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
493 
// Constructor registered for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix
// and convert it in place to the CUSPARSE subtype.
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  // Ensure a CUDA device is initialized before any GPU objects are created
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
502 
503 /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations.
507 
508    Collective
509 
510    Input Parameters:
511 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
512 .  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
513            This value should be the same as the local size used in creating the
514            y vector for the matrix-vector product y = Ax.
515 .  n - This value should be the same as the local size used in creating the
516        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
517        calculated if `N` is given) For square matrices `n` is almost always `m`.
518 .  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
519 .  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
520 .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
521            (same value is used for all local rows)
522 .  d_nnz - array containing the number of nonzeros in the various rows of the
523            DIAGONAL portion of the local submatrix (possibly different for each row)
524            or `NULL`, if `d_nz` is used to specify the nonzero structure.
525            The size of this array is equal to the number of local rows, i.e `m`.
526            For matrices you plan to factor you must leave room for the diagonal entry and
527            put in the entry even if it is zero.
528 .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
529            submatrix (same value is used for all local rows).
530 -  o_nnz - array containing the number of nonzeros in the various rows of the
531            OFF-DIAGONAL portion of the local submatrix (possibly different for
532            each row) or `NULL`, if `o_nz` is used to specify the nonzero
533            structure. The size of this array is equal to the number
534            of local rows, i.e `m`.
535 
536    Output Parameter:
537 .  A - the matrix
538 
539    Level: intermediate
540 
541    Notes:
542    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
543    MatXXXXSetPreallocation() paradigm instead of this routine directly.
544    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
545 
   The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
   storage.  That is, the stored row and column indices can begin at
548    storage.  That is, the stored row and column indices can begin at
549    either one (as in Fortran) or zero.
550 
551 .seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
552 @*/
// See the manual-page comment above: convenience constructor that picks the
// sequential or parallel CUSPARSE type based on the communicator size and
// applies the matching preallocation.
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size == 1) {
    // Single-rank communicator: the sequential type suffices and only the
    // "diagonal" preallocation arguments apply
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
570 
571 /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
576    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
577 
578    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
579    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
580    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
581    for communicators controlling multiple processes.  It is recommended that you call both of
582    the above preallocation routines for simplicity.
583 
584    Options Database Keys:
585 +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
586 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
587 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
588 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
589 
590   Level: beginner
591 
592 .seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
593 M*/
594 
595 /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
597 
598   Level: beginner
599 
600 .seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
601 M*/
602