/* Portions of this code are under:
   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpihipsparse/mpihipsparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecHIPEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
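
/* Usage sketch (not a call site in this file): a functor like VecHIPEquals is typically
   applied with thrust::for_each over a zip iterator to copy one device range into another:
     thrust::for_each(thrust::device,
                      thrust::make_zip_iterator(thrust::make_tuple(src, dst)),
                      thrust::make_zip_iterator(thrust::make_tuple(src + n, dst + n)),
                      VecHIPEquals());
   copies src[0..n) into dst[0..n); src and dst are illustrative device pointers. */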

static PetscErrorCode MatCOOStructDestroy_MPIAIJHIPSPARSE(PetscCtxRt data)
{
  MatCOOStruct_MPIAIJ *coo = *(MatCOOStruct_MPIAIJ **)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallHIP(hipFree(coo->Ajmap1));
  PetscCallHIP(hipFree(coo->Aperm1));
  PetscCallHIP(hipFree(coo->Bjmap1));
  PetscCallHIP(hipFree(coo->Bperm1));
  PetscCallHIP(hipFree(coo->Aimap2));
  PetscCallHIP(hipFree(coo->Ajmap2));
  PetscCallHIP(hipFree(coo->Aperm2));
  PetscCallHIP(hipFree(coo->Bimap2));
  PetscCallHIP(hipFree(coo->Bjmap2));
  PetscCallHIP(hipFree(coo->Bperm2));
  PetscCallHIP(hipFree(coo->Cperm1));
  PetscCallHIP(hipFree(coo->sendbuf));
  PetscCallHIP(hipFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j)); /* pass the host copies i, j; coo_i/coo_j may be device pointers */
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallHIP(hipMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallHIP(hipMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallHIP(hipMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallHIP(hipMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), hipMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

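/* Gather kernel used before communication: buf[i] = kv[perm[i]] packs the COO values owned
   by other ranks into a contiguous send buffer, using a grid-stride loop so any launch
   configuration covers all nnz entries. */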
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

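/* Accumulates locally provided COO values: thread i sums the kv[] entries that fold into
   nonzero i (ranges delimited by Ajmap1/Bjmap1, gathered through Aperm1/Bperm1) and adds
   the sum into the diagonal block A (i < Annz) or the off-diagonal block B. */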
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

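/* Accumulates COO values received from other ranks: Aimap2/Bimap2 scatter each folded sum
   onto its nonzero slot in A or B; values are always added on top of what the local kernel
   left there. */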
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
  PetscCall(PetscContainerGetPointer(container, &coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
    PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJHIPSPARSEGetArray(B, &Ba));
  }

  PetscCall(PetscLogGpuTimeBegin());
  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(MatPackCOOValues), dim3((coo->sendlen + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, coo->sendlen, Cperm1, vsend);
    PetscCallHIP(hipPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HIP, vsend, PETSC_MEMTYPE_HIP, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddLocalCOOValues), dim3((Annz + Bnnz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallHIP(hipPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddRemoteCOOValues), dim3((Annz2 + Bnnz2 + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallHIP(hipPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
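
/* User-facing sketch of the COO assembly path implemented above (indices and values are
   illustrative, not taken from a real call site):
     PetscInt    oi[] = {0, 0, 1}, oj[] = {0, 1, 1};
     PetscScalar ov[] = {1.0, 2.0, 3.0};
     PetscCall(MatSetPreallocationCOO(A, 3, oi, oj));
     PetscCall(MatSetValuesCOO(A, ov, ADD_VALUES));
   Repeated (i,j) pairs are summed; entries owned by other ranks are routed through the
   PetscSF communication set up in MatSetPreallocationCOO_MPIAIJHIPSPARSE(). */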

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJHIPSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ          *b               = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)b->spptr;
  PetscInt             i;

  PetscFunctionBegin;
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJHIPSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJHIPSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, hipsparseStruct->diagGPUMatFormat));
  PetscCall(MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, hipsparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMult_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_MPIAIJHIPSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTranspose_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatHIPSPARSESetFormat_MPIAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
{
  Mat_MPIAIJ          *a               = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_HIPSPARSE_MULT_DIAG:
    hipsparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_HIPSPARSE_MULT_OFFDIAG:
    hipsparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_HIPSPARSE_ALL:
    hipsparseStruct->diagGPUMatFormat    = format;
    hipsparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. Only MAT_HIPSPARSE_MULT_DIAG, MAT_HIPSPARSE_MULT_OFFDIAG, and MAT_HIPSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_MPIAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatHIPSPARSEStorageFormat format;
  PetscBool                 flg;
  Mat_MPIAIJ               *a               = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJHIPSPARSE      *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJHIPSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_MPIAIJHIPSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQHIP));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_MPIAIJHIPSPARSE(Mat A)
{
  Mat_MPIAIJ          *aij             = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(hipsparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete hipsparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECHIP, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJHIPSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJHIPSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQHIP));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJHIPSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJHIPSPARSE;
  A->ops->mult                  = MatMult_MPIAIJHIPSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJHIPSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJHIPSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJHIPSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJHIPSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJHIPSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->getcurrentmemtype     = MatGetCurrentMemType_MPIAIJ;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_MPIAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJHIPSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJHIPSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJHIPSPARSE(A, MATMPIAIJHIPSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJHIPSPARSE - Creates a sparse matrix in AIJ (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
          This value should be the same as the local size used in creating the
          y vector for the matrix-vector product y = Ax.
. n     - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
          This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row (same for all rows), for the "diagonal" portion of the matrix
. d_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "diagonal" portion of the matrix
. o_nz  - number of nonzeros per row (same for all rows), for the "off-diagonal" portion of the matrix
- o_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "off-diagonal" portion of the matrix

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameters `d_nz` (`o_nz`) or the arrays `d_nnz` (`o_nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored

  The `MATAIJ` format (compressed row storage) is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `d_nz` (`o_nz`) or `d_nnz` (`o_nnz`) (not both).
  Set `d_nz` (`o_nz`) = `PETSC_DEFAULT` and `d_nnz` (`o_nnz`) = `NULL` for PETSc to control dynamic memory
  allocation.
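
  Example Usage:
  A minimal sketch; the global size and preallocation counts below are illustrative only.
.vb
  Mat A;
  PetscCall(MatCreateAIJHIPSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
  // fill with MatSetValues() (or the MatSetValuesCOO() path), then assemble
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatDestroy(&A));
.ve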

.seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJHIPSPARSE`, `MATAIJHIPSPARSE`
@*/
PetscErrorCode MatCreateAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJHIPSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJHIPSPARSE`.

   A matrix type whose data resides on GPUs. These matrices can be stored in CSR, ELL,
   or hybrid format. All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.

   This matrix type is identical to `MATSEQAIJHIPSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJHIPSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijhipsparse - sets the matrix type to `MATMPIAIJHIPSPARSE`
.  -mat_hipsparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_hipsparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_hipsparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJHIPSPARSE()`, `MATSEQAIJHIPSPARSE`, `MATMPIAIJHIPSPARSE`, `MatCreateSeqAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
M*/
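
/* A minimal creation sketch for this type (sizes and nonzero counts are illustrative);
   calling both preallocation routines, as recommended above, is correct for any
   communicator size since only the one matching the actual type takes effect:
     Mat A;
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
     PetscCall(MatSetType(A, MATAIJHIPSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));
     PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL));
*/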

/*MC
   MATMPIAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJHIPSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJHIPSPARSE`, `MATSEQAIJHIPSPARSE`
M*/