#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void **data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
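
/*
  Illustrative usage sketch (not part of this implementation): how the COO assembly path built above is
  typically driven from user code. The sizes, the index arrays coo_i/coo_j, and the value array vals are
  hypothetical placeholders; each rank passes only the entries it wants to contribute.

    Mat         A;
    PetscInt    coo_i[2] = {0, 1};      // global row indices of this rank's COO entries
    PetscInt    coo_j[2] = {0, 1};      // matching global column indices
    PetscScalar vals[2]  = {1.0, 2.0};  // values, one per (coo_i, coo_j) pair

    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 10, 10));
    PetscCall(MatSetType(A, MATMPIAIJCUSPARSE));
    PetscCall(MatSetPreallocationCOO(A, 2, coo_i, coo_j)); // builds the host and device MatCOOStruct used here
    PetscCall(MatSetValuesCOO(A, vals, INSERT_VALUES));    // vals may also be a device (CUDA) pointer
    PetscCall(MatDestroy(&A));
*/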

__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}
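
/*
  A small worked example (hypothetical numbers, for illustration only) of the jmap/perm compression
  consumed by the two kernels above. Suppose a rank contributes 5 local COO values kv[0..4], the first
  nonzero of the diagonal block A is hit by kv[0] and kv[3], and its second nonzero is hit by kv[1] only:

    Ajmap1 = {0, 2, 3, ...}  // nonzero i of A accumulates kv[Aperm1[k]] for k in [Ajmap1[i], Ajmap1[i+1])
    Aperm1 = {0, 3, 1, ...}

  MatAddLocalCOOValues then computes Aa[0] = kv[0] + kv[3] and Aa[1] = kv[1], plus the previous value when
  imode != INSERT_VALUES. The *2 arrays play the same role for values received from other ranks, except
  that Aimap2[]/Bimap2[] give the destination nonzero, since remote contributions touch only a subset of
  the local nonzeros.
*/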

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  PetscCall(PetscLogGpuTimeBegin());
  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(int)((Annz + Bnnz + 255) / 256), 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(int)((Annz2 + Bnnz2 + 255) / 256), 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
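
/*
  Usage sketch (hypothetical, for illustration): MatSetValuesCOO() may be called repeatedly with either
  host or device value arrays. The memtype check above copies host data to the GPU automatically, while a
  device pointer is used in place. Assuming a matrix A preallocated with two COO entries as in the earlier
  sketch:

    PetscScalar  h_vals[2] = {3.0, 4.0};
    PetscScalar *d_vals;

    PetscCallCUDA(cudaMalloc((void **)&d_vals, 2 * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(d_vals, h_vals, 2 * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSetValuesCOO(A, d_vals, ADD_VALUES)); // detected as PETSC_MEMTYPE_CUDA, no host-to-device copy
    PetscCallCUDA(cudaFree(d_vals));
*/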

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
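
/*
  Sketch of the conventional (non-COO) preallocation path that ends up in the routine above (hypothetical
  sizes, for illustration only):

    Mat A;

    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
    PetscCall(MatSetType(A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL)); // 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block
    // MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd()/MatMult() then work as usual, with the data mirrored on the GPU
*/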

static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
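
/*
  Usage sketch (hypothetical): selecting a non-default cuSPARSE storage format for the SpMV of the diagonal
  block only, leaving the off-diagonal block in CSR; MAT_CUSPARSE_ALL would apply the format to both blocks.

    PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_ELL));

  The same choices are reachable from the command line through the -mat_cusparse_*_storage_format options
  handled below.
*/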

static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
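
/*
  Usage sketch (hypothetical): an assembled host MATMPIAIJ matrix B can be migrated to this type in place,
  which invokes the conversion routine defined above.

    PetscCall(MatConvert(B, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));

  Alternatively, -mat_type mpiaijcusparse together with MatSetFromOptions() selects the type before assembly.
*/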

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).  This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the cuSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
. n     - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
           This value should be the same as the local size used in creating the
           x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e. `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e. `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
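
/*
  Usage sketch for the convenience constructor above (hypothetical sizes, for illustration only):

    Mat A;

    PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
    // fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd(); subsequent MatMult() runs on the GPU
    PetscCall(MatDestroy(&A));
*/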

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/