1 /* Portions of this code are under:
2 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
3 */
4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
6 #include <../src/mat/impls/aij/mpi/mpihipsparse/mpihipsparsematimpl.h>
7 #include <thrust/advance.h>
8 #include <thrust/partition.h>
9 #include <thrust/sort.h>
10 #include <thrust/unique.h>
11 #include <petscsf.h>
12
13 struct VecHIPEquals {
14 template <typename Tuple>
operator ()VecHIPEquals15 __host__ __device__ void operator()(Tuple t)
16 {
17 thrust::get<1>(t) = thrust::get<0>(t);
18 }
19 };
20
MatCOOStructDestroy_MPIAIJHIPSPARSE(PetscCtxRt data)21 static PetscErrorCode MatCOOStructDestroy_MPIAIJHIPSPARSE(PetscCtxRt data)
22 {
23 MatCOOStruct_MPIAIJ *coo = *(MatCOOStruct_MPIAIJ **)data;
24
25 PetscFunctionBegin;
26 PetscCall(PetscSFDestroy(&coo->sf));
27 PetscCallHIP(hipFree(coo->Ajmap1));
28 PetscCallHIP(hipFree(coo->Aperm1));
29 PetscCallHIP(hipFree(coo->Bjmap1));
30 PetscCallHIP(hipFree(coo->Bperm1));
31 PetscCallHIP(hipFree(coo->Aimap2));
32 PetscCallHIP(hipFree(coo->Ajmap2));
33 PetscCallHIP(hipFree(coo->Aperm2));
34 PetscCallHIP(hipFree(coo->Bimap2));
35 PetscCallHIP(hipFree(coo->Bjmap2));
36 PetscCallHIP(hipFree(coo->Bperm2));
37 PetscCallHIP(hipFree(coo->Cperm1));
38 PetscCallHIP(hipFree(coo->sendbuf));
39 PetscCallHIP(hipFree(coo->recvbuf));
40 PetscCall(PetscFree(coo));
41 PetscFunctionReturn(PETSC_SUCCESS);
42 }
43
MatSetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat,PetscCount coo_n,PetscInt coo_i[],PetscInt coo_j[])44 static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
45 {
46 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
47 PetscBool dev_ij = PETSC_FALSE;
48 PetscMemType mtype = PETSC_MEMTYPE_HOST;
49 PetscInt *i, *j;
50 PetscContainer container_h;
51 MatCOOStruct_MPIAIJ *coo_h, *coo_d;
52
53 PetscFunctionBegin;
54 PetscCall(PetscFree(mpiaij->garray));
55 PetscCall(VecDestroy(&mpiaij->lvec));
56 #if defined(PETSC_USE_CTABLE)
57 PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
58 #else
59 PetscCall(PetscFree(mpiaij->colmap));
60 #endif
61 PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
62 mat->assembled = PETSC_FALSE;
63 mat->was_assembled = PETSC_FALSE;
64 PetscCall(PetscGetMemType(coo_i, &mtype));
65 if (PetscMemTypeDevice(mtype)) {
66 dev_ij = PETSC_TRUE;
67 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
68 PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
69 PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
70 } else {
71 i = coo_i;
72 j = coo_j;
73 }
74
75 PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
76 if (dev_ij) PetscCall(PetscFree2(i, j));
77 mat->offloadmask = PETSC_OFFLOAD_CPU;
78 // Create the GPU memory
79 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->A));
80 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->B));
81
82 // Copy the COO struct to device
83 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
84 PetscCall(PetscContainerGetPointer(container_h, &coo_h));
85 PetscCall(PetscMalloc1(1, &coo_d));
86 *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d
87
88 PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
89 PetscCallHIP(hipMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
90 PetscCallHIP(hipMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
91 PetscCallHIP(hipMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
92 PetscCallHIP(hipMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
93 PetscCallHIP(hipMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
94 PetscCallHIP(hipMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
95 PetscCallHIP(hipMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
96 PetscCallHIP(hipMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
97 PetscCallHIP(hipMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
98 PetscCallHIP(hipMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
99 PetscCallHIP(hipMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
100 PetscCallHIP(hipMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
101 PetscCallHIP(hipMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));
102
103 PetscCallHIP(hipMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
104 PetscCallHIP(hipMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), hipMemcpyHostToDevice));
105 PetscCallHIP(hipMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
106 PetscCallHIP(hipMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), hipMemcpyHostToDevice));
107 PetscCallHIP(hipMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
108 PetscCallHIP(hipMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
109 PetscCallHIP(hipMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), hipMemcpyHostToDevice));
110 PetscCallHIP(hipMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
111 PetscCallHIP(hipMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
112 PetscCallHIP(hipMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), hipMemcpyHostToDevice));
113 PetscCallHIP(hipMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), hipMemcpyHostToDevice));
114
115 // Put the COO struct in a container and then attach that to the matrix
116 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJHIPSPARSE));
117 PetscFunctionReturn(PETSC_SUCCESS);
118 }
119
MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])120 __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
121 {
122 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
123 const PetscCount grid_size = gridDim.x * blockDim.x;
124 for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
125 }
126
MatAddLocalCOOValues(const PetscScalar kv[],InsertMode imode,PetscCount Annz,const PetscCount Ajmap1[],const PetscCount Aperm1[],PetscScalar Aa[],PetscCount Bnnz,const PetscCount Bjmap1[],const PetscCount Bperm1[],PetscScalar Ba[])127 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
128 {
129 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
130 const PetscCount grid_size = gridDim.x * blockDim.x;
131 for (; i < Annz + Bnnz; i += grid_size) {
132 PetscScalar sum = 0.0;
133 if (i < Annz) {
134 for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
135 Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
136 } else {
137 i -= Annz;
138 for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
139 Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
140 }
141 }
142 }
143
MatAddRemoteCOOValues(const PetscScalar kv[],PetscCount Annz2,const PetscCount Aimap2[],const PetscCount Ajmap2[],const PetscCount Aperm2[],PetscScalar Aa[],PetscCount Bnnz2,const PetscCount Bimap2[],const PetscCount Bjmap2[],const PetscCount Bperm2[],PetscScalar Ba[])144 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
145 {
146 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
147 const PetscCount grid_size = gridDim.x * blockDim.x;
148 for (; i < Annz2 + Bnnz2; i += grid_size) {
149 if (i < Annz2) {
150 for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
151 } else {
152 i -= Annz2;
153 for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
154 }
155 }
156 }
157
MatSetValuesCOO_MPIAIJHIPSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)158 static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
159 {
160 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
161 Mat A = mpiaij->A, B = mpiaij->B;
162 PetscScalar *Aa, *Ba;
163 const PetscScalar *v1 = v;
164 PetscMemType memtype;
165 PetscContainer container;
166 MatCOOStruct_MPIAIJ *coo;
167
168 PetscFunctionBegin;
169 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
170 PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
171 PetscCall(PetscContainerGetPointer(container, &coo));
172
173 const auto &Annz = coo->Annz;
174 const auto &Annz2 = coo->Annz2;
175 const auto &Bnnz = coo->Bnnz;
176 const auto &Bnnz2 = coo->Bnnz2;
177 const auto &vsend = coo->sendbuf;
178 const auto &v2 = coo->recvbuf;
179 const auto &Ajmap1 = coo->Ajmap1;
180 const auto &Ajmap2 = coo->Ajmap2;
181 const auto &Aimap2 = coo->Aimap2;
182 const auto &Bjmap1 = coo->Bjmap1;
183 const auto &Bjmap2 = coo->Bjmap2;
184 const auto &Bimap2 = coo->Bimap2;
185 const auto &Aperm1 = coo->Aperm1;
186 const auto &Aperm2 = coo->Aperm2;
187 const auto &Bperm1 = coo->Bperm1;
188 const auto &Bperm2 = coo->Bperm2;
189 const auto &Cperm1 = coo->Cperm1;
190
191 PetscCall(PetscGetMemType(v, &memtype));
192 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
193 PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
194 PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
195 }
196
197 if (imode == INSERT_VALUES) {
198 PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
199 PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(B, &Ba));
200 } else {
201 PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa)); /* read & write matrix values */
202 PetscCall(MatSeqAIJHIPSPARSEGetArray(B, &Ba));
203 }
204
205 PetscCall(PetscLogGpuTimeBegin());
206 /* Pack entries to be sent to remote */
207 if (coo->sendlen) {
208 hipLaunchKernelGGL(HIP_KERNEL_NAME(MatPackCOOValues), dim3((coo->sendlen + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, coo->sendlen, Cperm1, vsend);
209 PetscCallHIP(hipPeekAtLastError());
210 }
211
212 /* Send remote entries to their owner and overlap the communication with local computation */
213 PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HIP, vsend, PETSC_MEMTYPE_HIP, v2, MPI_REPLACE));
214 /* Add local entries to A and B */
215 if (Annz + Bnnz > 0) {
216 hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddLocalCOOValues), dim3((Annz + Bnnz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
217 PetscCallHIP(hipPeekAtLastError());
218 }
219 PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));
220
221 /* Add received remote entries to A and B */
222 if (Annz2 + Bnnz2 > 0) {
223 hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddRemoteCOOValues), dim3((Annz2 + Bnnz2 + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
224 PetscCallHIP(hipPeekAtLastError());
225 }
226 PetscCall(PetscLogGpuTimeEnd());
227
228 if (imode == INSERT_VALUES) {
229 PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
230 PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(B, &Ba));
231 } else {
232 PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
233 PetscCall(MatSeqAIJHIPSPARSERestoreArray(B, &Ba));
234 }
235 if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
236 mat->offloadmask = PETSC_OFFLOAD_GPU;
237 PetscFunctionReturn(PETSC_SUCCESS);
238 }
239
MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE(Mat A,MatReuse scall,IS * glob,Mat * A_loc)240 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
241 {
242 Mat Ad, Ao;
243 const PetscInt *cmap;
244
245 PetscFunctionBegin;
246 PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
247 PetscCall(MatSeqAIJHIPSPARSEMergeMats(Ad, Ao, scall, A_loc));
248 if (glob) {
249 PetscInt cst, i, dn, on, *gidx;
250
251 PetscCall(MatGetLocalSize(Ad, NULL, &dn));
252 PetscCall(MatGetLocalSize(Ao, NULL, &on));
253 PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
254 PetscCall(PetscMalloc1(dn + on, &gidx));
255 for (i = 0; i < dn; i++) gidx[i] = cst + i;
256 for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
257 PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
258 }
259 PetscFunctionReturn(PETSC_SUCCESS);
260 }
261
MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])262 static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
263 {
264 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
265 Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)b->spptr;
266 PetscInt i;
267
268 PetscFunctionBegin;
269 if (B->hash_active) {
270 B->ops[0] = b->cops;
271 B->hash_active = PETSC_FALSE;
272 }
273 PetscCall(PetscLayoutSetUp(B->rmap));
274 PetscCall(PetscLayoutSetUp(B->cmap));
275 if (PetscDefined(USE_DEBUG) && d_nnz) {
276 for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
277 }
278 if (PetscDefined(USE_DEBUG) && o_nnz) {
279 for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
280 }
281 #if defined(PETSC_USE_CTABLE)
282 PetscCall(PetscHMapIDestroy(&b->colmap));
283 #else
284 PetscCall(PetscFree(b->colmap));
285 #endif
286 PetscCall(PetscFree(b->garray));
287 PetscCall(VecDestroy(&b->lvec));
288 PetscCall(VecScatterDestroy(&b->Mvctx));
289 /* Because the B will have been resized we simply destroy it and create a new one each time */
290 PetscCall(MatDestroy(&b->B));
291 if (!b->A) {
292 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
293 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
294 }
295 if (!b->B) {
296 PetscMPIInt size;
297 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
298 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
299 PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
300 }
301 PetscCall(MatSetType(b->A, MATSEQAIJHIPSPARSE));
302 PetscCall(MatSetType(b->B, MATSEQAIJHIPSPARSE));
303 PetscCall(MatBindToCPU(b->A, B->boundtocpu));
304 PetscCall(MatBindToCPU(b->B, B->boundtocpu));
305 PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
306 PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
307 PetscCall(MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, hipsparseStruct->diagGPUMatFormat));
308 PetscCall(MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, hipsparseStruct->offdiagGPUMatFormat));
309 B->preallocated = PETSC_TRUE;
310 B->was_assembled = PETSC_FALSE;
311 B->assembled = PETSC_FALSE;
312 PetscFunctionReturn(PETSC_SUCCESS);
313 }
314
MatMult_MPIAIJHIPSPARSE(Mat A,Vec xx,Vec yy)315 static PetscErrorCode MatMult_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
316 {
317 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
318
319 PetscFunctionBegin;
320 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
321 PetscCall((*a->A->ops->mult)(a->A, xx, yy));
322 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
323 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
324 PetscFunctionReturn(PETSC_SUCCESS);
325 }
326
MatZeroEntries_MPIAIJHIPSPARSE(Mat A)327 static PetscErrorCode MatZeroEntries_MPIAIJHIPSPARSE(Mat A)
328 {
329 Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
330
331 PetscFunctionBegin;
332 PetscCall(MatZeroEntries(l->A));
333 PetscCall(MatZeroEntries(l->B));
334 PetscFunctionReturn(PETSC_SUCCESS);
335 }
336
MatMultAdd_MPIAIJHIPSPARSE(Mat A,Vec xx,Vec yy,Vec zz)337 static PetscErrorCode MatMultAdd_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
338 {
339 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
340
341 PetscFunctionBegin;
342 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
343 PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
344 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
345 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
346 PetscFunctionReturn(PETSC_SUCCESS);
347 }
348
MatMultTranspose_MPIAIJHIPSPARSE(Mat A,Vec xx,Vec yy)349 static PetscErrorCode MatMultTranspose_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
350 {
351 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
352
353 PetscFunctionBegin;
354 PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
355 PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
356 PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
357 PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
358 PetscFunctionReturn(PETSC_SUCCESS);
359 }
360
MatHIPSPARSESetFormat_MPIAIJHIPSPARSE(Mat A,MatHIPSPARSEFormatOperation op,MatHIPSPARSEStorageFormat format)361 static PetscErrorCode MatHIPSPARSESetFormat_MPIAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
362 {
363 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
364 Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;
365
366 PetscFunctionBegin;
367 switch (op) {
368 case MAT_HIPSPARSE_MULT_DIAG:
369 hipsparseStruct->diagGPUMatFormat = format;
370 break;
371 case MAT_HIPSPARSE_MULT_OFFDIAG:
372 hipsparseStruct->offdiagGPUMatFormat = format;
373 break;
374 case MAT_HIPSPARSE_ALL:
375 hipsparseStruct->diagGPUMatFormat = format;
376 hipsparseStruct->offdiagGPUMatFormat = format;
377 break;
378 default:
379 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. Only MAT_HIPSPARSE_MULT_DIAG, MAT_HIPSPARSE_MULT_DIAG, and MAT_HIPSPARSE_MULT_ALL are currently supported.", op);
380 }
381 PetscFunctionReturn(PETSC_SUCCESS);
382 }
383
MatSetFromOptions_MPIAIJHIPSPARSE(Mat A,PetscOptionItems PetscOptionsObject)384 static PetscErrorCode MatSetFromOptions_MPIAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
385 {
386 MatHIPSPARSEStorageFormat format;
387 PetscBool flg;
388 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
389 Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;
390
391 PetscFunctionBegin;
392 PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJHIPSPARSE options");
393 if (A->factortype == MAT_FACTOR_NONE) {
394 PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
395 if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_DIAG, format));
396 PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
397 if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_OFFDIAG, format));
398 PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
399 if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
400 }
401 PetscOptionsHeadEnd();
402 PetscFunctionReturn(PETSC_SUCCESS);
403 }
404
MatAssemblyEnd_MPIAIJHIPSPARSE(Mat A,MatAssemblyType mode)405 static PetscErrorCode MatAssemblyEnd_MPIAIJHIPSPARSE(Mat A, MatAssemblyType mode)
406 {
407 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
408
409 PetscFunctionBegin;
410 PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
411 if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQHIP));
412 PetscFunctionReturn(PETSC_SUCCESS);
413 }
414
MatDestroy_MPIAIJHIPSPARSE(Mat A)415 static PetscErrorCode MatDestroy_MPIAIJHIPSPARSE(Mat A)
416 {
417 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
418 Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)aij->spptr;
419
420 PetscFunctionBegin;
421 PetscCheck(hipsparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
422 PetscCallCXX(delete hipsparseStruct);
423 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
424 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
425 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
426 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
427 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
428 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", NULL));
429 PetscCall(MatDestroy_MPIAIJ(A));
430 PetscFunctionReturn(PETSC_SUCCESS);
431 }
432
MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat B,MatType mtype,MatReuse reuse,Mat * newmat)433 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
434 {
435 Mat_MPIAIJ *a;
436 Mat A;
437
438 PetscFunctionBegin;
439 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
440 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
441 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
442 A = *newmat;
443 A->boundtocpu = PETSC_FALSE;
444 PetscCall(PetscFree(A->defaultvectype));
445 PetscCall(PetscStrallocpy(VECHIP, &A->defaultvectype));
446
447 a = (Mat_MPIAIJ *)A->data;
448 if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJHIPSPARSE));
449 if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJHIPSPARSE));
450 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQHIP));
451
452 if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJHIPSPARSE);
453
454 A->ops->assemblyend = MatAssemblyEnd_MPIAIJHIPSPARSE;
455 A->ops->mult = MatMult_MPIAIJHIPSPARSE;
456 A->ops->multadd = MatMultAdd_MPIAIJHIPSPARSE;
457 A->ops->multtranspose = MatMultTranspose_MPIAIJHIPSPARSE;
458 A->ops->setfromoptions = MatSetFromOptions_MPIAIJHIPSPARSE;
459 A->ops->destroy = MatDestroy_MPIAIJHIPSPARSE;
460 A->ops->zeroentries = MatZeroEntries_MPIAIJHIPSPARSE;
461 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
462 A->ops->getcurrentmemtype = MatGetCurrentMemType_MPIAIJ;
463
464 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJHIPSPARSE));
465 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE));
466 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE));
467 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_MPIAIJHIPSPARSE));
468 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJHIPSPARSE));
469 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJHIPSPARSE));
470 #if defined(PETSC_HAVE_HYPRE)
471 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
472 #endif
473 PetscFunctionReturn(PETSC_SUCCESS);
474 }
475
MatCreate_MPIAIJHIPSPARSE(Mat A)476 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJHIPSPARSE(Mat A)
477 {
478 PetscFunctionBegin;
479 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
480 PetscCall(MatCreate_MPIAIJ(A));
481 PetscCall(MatConvert_MPIAIJ_MPIAIJHIPSPARSE(A, MATMPIAIJHIPSPARSE, MAT_INPLACE_MATRIX, &A));
482 PetscFunctionReturn(PETSC_SUCCESS);
483 }
484
485 /*@
486 MatCreateAIJHIPSPARSE - Creates a sparse matrix in AIJ (compressed row) format
487 (the default parallel PETSc format).
488
489 Collective
490
491 Input Parameters:
492 + comm - MPI communicator, set to `PETSC_COMM_SELF`
493 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
494 This value should be the same as the local size used in creating the
495 y vector for the matrix-vector product y = Ax.
496 . n - This value should be the same as the local size used in creating the
497 x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
498 calculated if `N` is given) For square matrices `n` is almost always `m`.
499 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
500 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
501 . d_nz - number of nonzeros per row (same for all rows), for the "diagonal" portion of the matrix
502 . d_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "diagonal" portion of the matrix
503 . o_nz - number of nonzeros per row (same for all rows), for the "off-diagonal" portion of the matrix
504 - o_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "off-diagonal" portion of the matrix
505
506 Output Parameter:
507 . A - the matrix
508
509 Level: intermediate
510
511 Notes:
512 This matrix will ultimately pushed down to AMD GPUs and use the HIPSPARSE library for
513 calculations. For good matrix assembly performance the user should preallocate the matrix
514 storage by setting the parameter `nz` (or the array `nnz`).
515
516 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
517 MatXXXXSetPreallocation() paradigm instead of this routine directly.
518 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
519
520 If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored
521
522 The `MATAIJ` format (compressed row storage), is fully compatible with standard Fortran
523 storage. That is, the stored row and column indices can begin at
524 either one (as in Fortran) or zero.
525
526 Specify the preallocated storage with either `d_nz` (`o_nz`) or `d_nnz` (`o_nnz`) (not both).
527 Set `d_nz` (`o_nz`) = `PETSC_DEFAULT` and `d_nnz` (`o_nnz`) = `NULL` for PETSc to control dynamic memory
528 allocation.
529
530 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJHIPSPARSE`, `MATAIJHIPSPARSE`
531 @*/
MatCreateAIJHIPSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat * A)532 PetscErrorCode MatCreateAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
533 {
534 PetscMPIInt size;
535
536 PetscFunctionBegin;
537 PetscCall(MatCreate(comm, A));
538 PetscCall(MatSetSizes(*A, m, n, M, N));
539 PetscCallMPI(MPI_Comm_size(comm, &size));
540 if (size > 1) {
541 PetscCall(MatSetType(*A, MATMPIAIJHIPSPARSE));
542 PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
543 } else {
544 PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
545 PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
546 }
547 PetscFunctionReturn(PETSC_SUCCESS);
548 }
549
550 /*MC
551 MATAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJHIPSPARSE`.
552
553 A matrix type whose data resides on GPUs. These matrices can be in either
554 CSR, ELL, or Hybrid format. All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.
555
556 This matrix type is identical to `MATSEQAIJHIPSPARSE` when constructed with a single process communicator,
557 and `MATMPIAIJHIPSPARSE` otherwise. As a result, for single process communicators,
558 `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
559 for communicators controlling multiple processes. It is recommended that you call both of
560 the above preallocation routines for simplicity.
561
562 Options Database Keys:
563 + -mat_type mpiaijhipsparse - sets the matrix type to `MATMPIAIJHIPSPARSE`
564 . -mat_hipsparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
565 . -mat_hipsparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
566 - -mat_hipsparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
567
568 Level: beginner
569
570 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJHIPSPARSE()`, `MATSEQAIJHIPSPARSE`, `MATMPIAIJHIPSPARSE`, `MatCreateSeqAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
571 M*/
572
573 /*MC
574 MATMPIAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJHIPSPARSE`.
575
576 Level: beginner
577
578 .seealso: [](ch_matrices), `Mat`, `MATAIJHIPSPARSE`, `MATSEQAIJHIPSPARSE`
579 M*/
580