xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 010bec48e404682eb1850b482d20a94ac5ea46b4)
1 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
2 
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7 #include <thrust/advance.h>
8 #include <thrust/partition.h>
9 #include <thrust/sort.h>
10 #include <thrust/unique.h>
11 #include <petscsf.h>
12 
13 struct VecCUDAEquals {
14   template <typename Tuple>
15   __host__ __device__ void operator()(Tuple t)
16   {
17     thrust::get<1>(t) = thrust::get<0>(t);
18   }
19 };
20 
21 static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
22 {
23   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;
24 
25   PetscFunctionBegin;
26   PetscCall(PetscSFDestroy(&coo->sf));
27   PetscCallCUDA(cudaFree(coo->Ajmap1));
28   PetscCallCUDA(cudaFree(coo->Aperm1));
29   PetscCallCUDA(cudaFree(coo->Bjmap1));
30   PetscCallCUDA(cudaFree(coo->Bperm1));
31   PetscCallCUDA(cudaFree(coo->Aimap2));
32   PetscCallCUDA(cudaFree(coo->Ajmap2));
33   PetscCallCUDA(cudaFree(coo->Aperm2));
34   PetscCallCUDA(cudaFree(coo->Bimap2));
35   PetscCallCUDA(cudaFree(coo->Bjmap2));
36   PetscCallCUDA(cudaFree(coo->Bperm2));
37   PetscCallCUDA(cudaFree(coo->Cperm1));
38   PetscCallCUDA(cudaFree(coo->sendbuf));
39   PetscCallCUDA(cudaFree(coo->recvbuf));
40   PetscCall(PetscFree(coo));
41   PetscFunctionReturn(PETSC_SUCCESS);
42 }
43 
44 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
45 {
46   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
47   PetscBool            dev_ij = PETSC_FALSE;
48   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
49   PetscInt            *i, *j;
50   PetscContainer       container_h, container_d;
51   MatCOOStruct_MPIAIJ *coo_h, *coo_d;
52 
53   PetscFunctionBegin;
54   PetscCall(PetscFree(mpiaij->garray));
55   PetscCall(VecDestroy(&mpiaij->lvec));
56 #if defined(PETSC_USE_CTABLE)
57   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
58 #else
59   PetscCall(PetscFree(mpiaij->colmap));
60 #endif
61   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
62   mat->assembled     = PETSC_FALSE;
63   mat->was_assembled = PETSC_FALSE;
64   PetscCall(PetscGetMemType(coo_i, &mtype));
65   if (PetscMemTypeDevice(mtype)) {
66     dev_ij = PETSC_TRUE;
67     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
68     PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
69     PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
70   } else {
71     i = coo_i;
72     j = coo_j;
73   }
74 
75   PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
76   if (dev_ij) PetscCall(PetscFree2(i, j));
77   mat->offloadmask = PETSC_OFFLOAD_CPU;
78   // Create the GPU memory
79   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
80   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
81 
82   // Copy the COO struct to device
83   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
84   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
85   PetscCall(PetscMalloc1(1, &coo_d));
86   *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d
87 
88   PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
89   PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
90   PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
91   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
92   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
93   PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
94   PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
95   PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
96   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
97   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
98   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
99   PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
100   PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
101   PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));
102 
103   PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
104   PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
105   PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
106   PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
107   PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
108   PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
109   PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
110   PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
111   PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
112   PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
113   PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
114 
115   // Put the COO struct in a container and then attach that to the matrix
116   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
117   PetscCall(PetscContainerSetPointer(container_d, coo_d));
118   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
119   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
120   PetscCall(PetscContainerDestroy(&container_d));
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
124 __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
125 {
126   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
127   const PetscCount grid_size = gridDim.x * blockDim.x;
128   for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
129 }
130 
131 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
132 {
133   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
134   const PetscCount grid_size = gridDim.x * blockDim.x;
135   for (; i < Annz + Bnnz; i += grid_size) {
136     PetscScalar sum = 0.0;
137     if (i < Annz) {
138       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
139       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
140     } else {
141       i -= Annz;
142       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
143       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
144     }
145   }
146 }
147 
148 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
149 {
150   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
151   const PetscCount grid_size = gridDim.x * blockDim.x;
152   for (; i < Annz2 + Bnnz2; i += grid_size) {
153     if (i < Annz2) {
154       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
155     } else {
156       i -= Annz2;
157       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
158     }
159   }
160 }
161 
162 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
163 {
164   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
165   Mat                  A = mpiaij->A, B = mpiaij->B;
166   PetscScalar         *Aa, *Ba;
167   const PetscScalar   *v1 = v;
168   PetscMemType         memtype;
169   PetscContainer       container;
170   MatCOOStruct_MPIAIJ *coo;
171 
172   PetscFunctionBegin;
173   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
174   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
175   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
176 
177   const auto &Annz   = coo->Annz;
178   const auto &Annz2  = coo->Annz2;
179   const auto &Bnnz   = coo->Bnnz;
180   const auto &Bnnz2  = coo->Bnnz2;
181   const auto &vsend  = coo->sendbuf;
182   const auto &v2     = coo->recvbuf;
183   const auto &Ajmap1 = coo->Ajmap1;
184   const auto &Ajmap2 = coo->Ajmap2;
185   const auto &Aimap2 = coo->Aimap2;
186   const auto &Bjmap1 = coo->Bjmap1;
187   const auto &Bjmap2 = coo->Bjmap2;
188   const auto &Bimap2 = coo->Bimap2;
189   const auto &Aperm1 = coo->Aperm1;
190   const auto &Aperm2 = coo->Aperm2;
191   const auto &Bperm1 = coo->Bperm1;
192   const auto &Bperm2 = coo->Bperm2;
193   const auto &Cperm1 = coo->Cperm1;
194 
195   PetscCall(PetscGetMemType(v, &memtype));
196   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
197     PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
198     PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
199   }
200 
201   if (imode == INSERT_VALUES) {
202     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
203     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
204   } else {
205     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
206     PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
207   }
208 
209   PetscCall(PetscLogGpuTimeBegin());
210   /* Pack entries to be sent to remote */
211   if (coo->sendlen) {
212     MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
213     PetscCallCUDA(cudaPeekAtLastError());
214   }
215 
216   /* Send remote entries to their owner and overlap the communication with local computation */
217   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
218   /* Add local entries to A and B */
219   if (Annz + Bnnz > 0) {
220     MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
221     PetscCallCUDA(cudaPeekAtLastError());
222   }
223   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));
224 
225   /* Add received remote entries to A and B */
226   if (Annz2 + Bnnz2 > 0) {
227     MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
228     PetscCallCUDA(cudaPeekAtLastError());
229   }
230   PetscCall(PetscLogGpuTimeEnd());
231 
232   if (imode == INSERT_VALUES) {
233     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
234     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
235   } else {
236     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
237     PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
238   }
239   if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
240   mat->offloadmask = PETSC_OFFLOAD_GPU;
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
245 {
246   Mat             Ad, Ao;
247   const PetscInt *cmap;
248 
249   PetscFunctionBegin;
250   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
251   PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
252   if (glob) {
253     PetscInt cst, i, dn, on, *gidx;
254 
255     PetscCall(MatGetLocalSize(Ad, NULL, &dn));
256     PetscCall(MatGetLocalSize(Ao, NULL, &on));
257     PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
258     PetscCall(PetscMalloc1(dn + on, &gidx));
259     for (i = 0; i < dn; i++) gidx[i] = cst + i;
260     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
261     PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
262   }
263   PetscFunctionReturn(PETSC_SUCCESS);
264 }
265 
266 static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
267 {
268   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
269   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
270   PetscInt            i;
271 
272   PetscFunctionBegin;
273   if (B->hash_active) {
274     B->ops[0]      = b->cops;
275     B->hash_active = PETSC_FALSE;
276   }
277   PetscCall(PetscLayoutSetUp(B->rmap));
278   PetscCall(PetscLayoutSetUp(B->cmap));
279   if (PetscDefined(USE_DEBUG) && d_nnz) {
280     for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
281   }
282   if (PetscDefined(USE_DEBUG) && o_nnz) {
283     for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
284   }
285 #if defined(PETSC_USE_CTABLE)
286   PetscCall(PetscHMapIDestroy(&b->colmap));
287 #else
288   PetscCall(PetscFree(b->colmap));
289 #endif
290   PetscCall(PetscFree(b->garray));
291   PetscCall(VecDestroy(&b->lvec));
292   PetscCall(VecScatterDestroy(&b->Mvctx));
293   /* Because the B will have been resized we simply destroy it and create a new one each time */
294   PetscCall(MatDestroy(&b->B));
295   if (!b->A) {
296     PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
297     PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
298   }
299   if (!b->B) {
300     PetscMPIInt size;
301     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
302     PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
303     PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
304   }
305   PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
306   PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
307   PetscCall(MatBindToCPU(b->A, B->boundtocpu));
308   PetscCall(MatBindToCPU(b->B, B->boundtocpu));
309   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
310   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
311   PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
312   PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
313   B->preallocated  = PETSC_TRUE;
314   B->was_assembled = PETSC_FALSE;
315   B->assembled     = PETSC_FALSE;
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
320 {
321   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
322 
323   PetscFunctionBegin;
324   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
325   PetscCall((*a->A->ops->mult)(a->A, xx, yy));
326   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
327   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
328   PetscFunctionReturn(PETSC_SUCCESS);
329 }
330 
331 static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
332 {
333   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
334 
335   PetscFunctionBegin;
336   PetscCall(MatZeroEntries(l->A));
337   PetscCall(MatZeroEntries(l->B));
338   PetscFunctionReturn(PETSC_SUCCESS);
339 }
340 
341 static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
342 {
343   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
344 
345   PetscFunctionBegin;
346   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
347   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
348   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
349   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
350   PetscFunctionReturn(PETSC_SUCCESS);
351 }
352 
353 static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
354 {
355   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
356 
357   PetscFunctionBegin;
358   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
359   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
360   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
361   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
362   PetscFunctionReturn(PETSC_SUCCESS);
363 }
364 
365 static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
366 {
367   Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
368   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
369 
370   PetscFunctionBegin;
371   switch (op) {
372   case MAT_CUSPARSE_MULT_DIAG:
373     cusparseStruct->diagGPUMatFormat = format;
374     break;
375   case MAT_CUSPARSE_MULT_OFFDIAG:
376     cusparseStruct->offdiagGPUMatFormat = format;
377     break;
378   case MAT_CUSPARSE_ALL:
379     cusparseStruct->diagGPUMatFormat    = format;
380     cusparseStruct->offdiagGPUMatFormat = format;
381     break;
382   default:
383     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op);
384   }
385   PetscFunctionReturn(PETSC_SUCCESS);
386 }
387 
388 static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
389 {
390   MatCUSPARSEStorageFormat format;
391   PetscBool                flg;
392   Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
393   Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
394 
395   PetscFunctionBegin;
396   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
397   if (A->factortype == MAT_FACTOR_NONE) {
398     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
399     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
400     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
401     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
402     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
403     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
404   }
405   PetscOptionsHeadEnd();
406   PetscFunctionReturn(PETSC_SUCCESS);
407 }
408 
409 static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
410 {
411   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
412 
413   PetscFunctionBegin;
414   PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
415   if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
416   PetscFunctionReturn(PETSC_SUCCESS);
417 }
418 
419 static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
420 {
421   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
422   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
423 
424   PetscFunctionBegin;
425   PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
426   PetscCallCXX(delete cusparseStruct);
427   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
428   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
429   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
430   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
431   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
432   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
433   PetscCall(MatDestroy_MPIAIJ(A));
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
437 /* defines MatSetValues_MPICUSPARSE_Hash() */
438 #define TYPE AIJ
439 #define TYPE_AIJ
440 #define SUB_TYPE_CUSPARSE
441 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
442 #undef TYPE
443 #undef TYPE_AIJ
444 #undef SUB_TYPE_CUSPARSE
445 
446 static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
447 {
448   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
449   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
450 
451   PetscFunctionBegin;
452   PetscCall(MatSetUp_MPI_Hash(A));
453   PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
454   PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
455   A->preallocated = PETSC_TRUE;
456   PetscFunctionReturn(PETSC_SUCCESS);
457 }
458 
459 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
460 {
461   Mat_MPIAIJ *a;
462   Mat         A;
463 
464   PetscFunctionBegin;
465   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
466   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
467   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
468   A             = *newmat;
469   A->boundtocpu = PETSC_FALSE;
470   PetscCall(PetscFree(A->defaultvectype));
471   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
472 
473   a = (Mat_MPIAIJ *)A->data;
474   if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
475   if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
476   if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
477 
478   if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
479 
480   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
481   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
482   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
483   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
484   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
485   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
486   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
487   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
488   A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;
489 
490   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
491   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
492   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
493   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
494   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
495   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
496 #if defined(PETSC_HAVE_HYPRE)
497   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
498 #endif
499   PetscFunctionReturn(PETSC_SUCCESS);
500 }
501 
502 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
503 {
504   PetscFunctionBegin;
505   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
506   PetscCall(MatCreate_MPIAIJ(A));
507   PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
508   PetscFunctionReturn(PETSC_SUCCESS);
509 }
510 
511 /*@
512   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
513   (the default parallel PETSc format).  This matrix will ultimately pushed down
514   to NVIDIA GPUs and use the CuSPARSE library for calculations.
515 
516   Collective
517 
518   Input Parameters:
519 + comm  - MPI communicator, set to `PETSC_COMM_SELF`
520 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
521            This value should be the same as the local size used in creating the
522            y vector for the matrix-vector product y = Ax.
523 . n     - This value should be the same as the local size used in creating the
524        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
525        calculated if `N` is given) For square matrices `n` is almost always `m`.
526 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
527 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
528 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
529            (same value is used for all local rows)
530 . d_nnz - array containing the number of nonzeros in the various rows of the
531            DIAGONAL portion of the local submatrix (possibly different for each row)
532            or `NULL`, if `d_nz` is used to specify the nonzero structure.
533            The size of this array is equal to the number of local rows, i.e `m`.
534            For matrices you plan to factor you must leave room for the diagonal entry and
535            put in the entry even if it is zero.
536 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
537            submatrix (same value is used for all local rows).
538 - o_nnz - array containing the number of nonzeros in the various rows of the
539            OFF-DIAGONAL portion of the local submatrix (possibly different for
540            each row) or `NULL`, if `o_nz` is used to specify the nonzero
541            structure. The size of this array is equal to the number
542            of local rows, i.e `m`.
543 
544   Output Parameter:
545 . A - the matrix
546 
547   Level: intermediate
548 
549   Notes:
550   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
551   MatXXXXSetPreallocation() paradigm instead of this routine directly.
552   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
553 
554   The AIJ format, also called the
555   compressed row storage), is fully compatible with standard Fortran
556   storage.  That is, the stored row and column indices can begin at
557   either one (as in Fortran) or zero.
558 
559 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
560 @*/
561 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
562 {
563   PetscMPIInt size;
564 
565   PetscFunctionBegin;
566   PetscCall(MatCreate(comm, A));
567   PetscCall(MatSetSizes(*A, m, n, M, N));
568   PetscCallMPI(MPI_Comm_size(comm, &size));
569   if (size > 1) {
570     PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
571     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
572   } else {
573     PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
574     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
575   }
576   PetscFunctionReturn(PETSC_SUCCESS);
577 }
578 
579 /*MC
580    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJCUSPARSE`.
581 
582    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
583    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
584    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
585 
586    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
587    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
588    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
589    for communicators controlling multiple processes.  It is recommended that you call both of
590    the above preallocation routines for simplicity.
591 
592    Options Database Keys:
593 +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
594 .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
595 .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
596 -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
597 
598   Level: beginner
599 
600 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
601 M*/
602 
603 /*MC
604    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJCUSPARSE`.
605 
606   Level: beginner
607 
608 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
609 M*/
610