/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the HIPSPARSE library.
  Portions of this code are under:
  Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*/
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal()
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/iterator/transform_iterator.h>
17 #if PETSC_CPP_VERSION >= 14
18 #define PETSC_HAVE_THRUST_ASYNC 1
19 #include <thrust/async/for_each.h>
20 #endif
21 #include <thrust/iterator/constant_iterator.h>
22 #include <thrust/iterator/discard_iterator.h>
23 #include <thrust/binary_search.h>
24 #include <thrust/remove.h>
25 #include <thrust/sort.h>
26 #include <thrust/unique.h>
27
28 const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
29 const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
30 const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
31 //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
32
33 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
34 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
35 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
36 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
37 static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
38 static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
39 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
40 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
41 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
42 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
43 static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
44 static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
45 static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
46 static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
47 static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
48 static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
49 static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
50 static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
51 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
52 static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
53 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
54 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
55 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
56 static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
57 static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
58 static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
59 static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
60 static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
61 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
62 static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
63 static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
64 static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);
65
66 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
67 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
68
69 /*
70 PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
71 {
72 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
73
74 PetscFunctionBegin;
75 PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
76 hipsparsestruct->stream = stream;
77 PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
78 PetscFunctionReturn(PETSC_SUCCESS);
79 }
80
81 PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
82 {
83 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
84
85 PetscFunctionBegin;
86 PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
87 if (hipsparsestruct->handle != handle) {
88 if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
89 hipsparsestruct->handle = handle;
90 }
91 PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
92 PetscFunctionReturn(PETSC_SUCCESS);
93 }
94
95 PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
96 {
97 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
98 PetscBool flg;
99
100 PetscFunctionBegin;
101 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
102 if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
103 if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104 PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 */
107
MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A,MatHIPSPARSEFormatOperation op,MatHIPSPARSEStorageFormat format)108 PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109 {
110 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
111
112 PetscFunctionBegin;
113 switch (op) {
114 case MAT_HIPSPARSE_MULT:
115 hipsparsestruct->format = format;
116 break;
117 case MAT_HIPSPARSE_ALL:
118 hipsparsestruct->format = format;
119 break;
120 default:
121 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
122 }
123 PetscFunctionReturn(PETSC_SUCCESS);
124 }
125
/*@
  MatHIPSPARSESetFormat - Sets the storage format of `MATSEQAIJHIPSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats.

  Not Collective

  Input Parameters:
+ A - Matrix of type `MATSEQAIJHIPSPARSE`
. op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
        `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
- format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
@*/
MatHIPSPARSESetFormat(Mat A,MatHIPSPARSEFormatOperation op,MatHIPSPARSEStorageFormat format)142 PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
143 {
144 PetscFunctionBegin;
145 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
146 PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
147 PetscFunctionReturn(PETSC_SUCCESS);
148 }
149
MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A,PetscBool use_cpu)150 PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
151 {
152 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
153
154 PetscFunctionBegin;
155 hipsparsestruct->use_cpu_solve = use_cpu;
156 PetscFunctionReturn(PETSC_SUCCESS);
157 }
158
/*@
  MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.

  Input Parameters:
+ A - Matrix of type `MATSEQAIJHIPSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Notes:
  The hipSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or the GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
@*/
MatHIPSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)175 PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
176 {
177 PetscFunctionBegin;
178 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
179 PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
180 PetscFunctionReturn(PETSC_SUCCESS);
181 }
182
MatSetOption_SeqAIJHIPSPARSE(Mat A,MatOption op,PetscBool flg)183 static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
184 {
185 PetscFunctionBegin;
186 switch (op) {
187 case MAT_FORM_EXPLICIT_TRANSPOSE:
188 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
189 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
190 A->form_explicit_transpose = flg;
191 break;
192 default:
193 PetscCall(MatSetOption_SeqAIJ(A, op, flg));
194 break;
195 }
196 PetscFunctionReturn(PETSC_SUCCESS);
197 }
198
MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B,Mat A,const MatFactorInfo * info)199 static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
200 {
201 PetscBool row_identity, col_identity;
202 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
203 IS isrow = b->row, iscol = b->col;
204 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;
205
206 PetscFunctionBegin;
207 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
208 PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
209 B->offloadmask = PETSC_OFFLOAD_CPU;
210 /* determine which version of MatSolve needs to be used. */
211 PetscCall(ISIdentity(isrow, &row_identity));
212 PetscCall(ISIdentity(iscol, &col_identity));
213 if (!hipsparsestruct->use_cpu_solve) {
214 if (row_identity && col_identity) {
215 B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
216 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
217 } else {
218 B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
219 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
220 }
221 }
222 B->ops->matsolve = NULL;
223 B->ops->matsolvetranspose = NULL;
224
225 /* get the triangular factors */
226 if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
227 PetscFunctionReturn(PETSC_SUCCESS);
228 }
229
MatSetFromOptions_SeqAIJHIPSPARSE(Mat A,PetscOptionItems PetscOptionsObject)230 static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
231 {
232 MatHIPSPARSEStorageFormat format;
233 PetscBool flg;
234 Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
235
236 PetscFunctionBegin;
237 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
238 if (A->factortype == MAT_FACTOR_NONE) {
239 PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
240 if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
241 PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
242 if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
243 PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
244 if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
245 PetscCall(
246 PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
247 /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
248 PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
249 PetscCall(
250 PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
251 PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
252 /*
253 PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
254 PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
255 */
256 }
257 PetscOptionsHeadEnd();
258 PetscFunctionReturn(PETSC_SUCCESS);
259 }
260
MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)261 static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
262 {
263 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
264 PetscInt n = A->rmap->n;
265 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
266 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
267 const PetscInt *ai = a->i, *aj = a->j, *vi;
268 const MatScalar *aa = a->a, *v;
269 PetscInt *AiLo, *AjLo;
270 PetscInt i, nz, nzLower, offset, rowOffset;
271
272 PetscFunctionBegin;
273 if (!n) PetscFunctionReturn(PETSC_SUCCESS);
274 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
275 try {
276 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
277 nzLower = n + ai[n] - ai[1];
278 if (!loTriFactor) {
279 PetscScalar *AALo;
280 PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));
281
282 /* Allocate Space for the lower triangular matrix */
283 PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
284 PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));
285
286 /* Fill the lower triangular matrix */
287 AiLo[0] = (PetscInt)0;
288 AiLo[n] = nzLower;
289 AjLo[0] = (PetscInt)0;
290 AALo[0] = (MatScalar)1.0;
291 v = aa;
292 vi = aj;
293 offset = 1;
294 rowOffset = 1;
295 for (i = 1; i < n; i++) {
296 nz = ai[i + 1] - ai[i];
297 /* additional 1 for the term on the diagonal */
298 AiLo[i] = rowOffset;
299 rowOffset += nz + 1;
300
301 PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
302 PetscCall(PetscArraycpy(&AALo[offset], v, nz));
303 offset += nz;
304 AjLo[offset] = (PetscInt)i;
305 AALo[offset] = (MatScalar)1.0;
306 offset += 1;
307 v += nz;
308 vi += nz;
309 }
310
311 /* allocate space for the triangular factor information */
312 PetscCall(PetscNew(&loTriFactor));
313 loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
314 /* Create the matrix description */
315 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
316 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
317 PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
318 PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
319 PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
320
321 /* set the operation */
322 loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
323
324 /* set the matrix */
325 loTriFactor->csrMat = new CsrMatrix;
326 loTriFactor->csrMat->num_rows = n;
327 loTriFactor->csrMat->num_cols = n;
328 loTriFactor->csrMat->num_entries = nzLower;
329 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
330 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
331 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
332
333 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
334 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
335 loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
336
337 /* Create the solve analysis information */
338 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
339 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
340 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
341 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
342 PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
343
344 /* perform the solve analysis */
345 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
346 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
347
348 PetscCallHIP(WaitForHIP());
349 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
350
351 /* assign the pointer */
352 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
353 loTriFactor->AA_h = AALo;
354 PetscCallHIP(hipHostFree(AiLo));
355 PetscCallHIP(hipHostFree(AjLo));
356 PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
357 } else { /* update values only */
358 if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
359 /* Fill the lower triangular matrix */
360 loTriFactor->AA_h[0] = 1.0;
361 v = aa;
362 vi = aj;
363 offset = 1;
364 for (i = 1; i < n; i++) {
365 nz = ai[i + 1] - ai[i];
366 PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
367 offset += nz;
368 loTriFactor->AA_h[offset] = 1.0;
369 offset += 1;
370 v += nz;
371 }
372 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
373 PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
374 }
375 } catch (char *ex) {
376 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
377 }
378 }
379 PetscFunctionReturn(PETSC_SUCCESS);
380 }
381
MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)382 static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
383 {
384 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
385 PetscInt n = A->rmap->n;
386 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
387 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
388 const PetscInt *aj = a->j, *adiag, *vi;
389 const MatScalar *aa = a->a, *v;
390 PetscInt *AiUp, *AjUp;
391 PetscInt i, nz, nzUpper, offset;
392
393 PetscFunctionBegin;
394 if (!n) PetscFunctionReturn(PETSC_SUCCESS);
395 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
396 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
397 try {
398 /* next, figure out the number of nonzeros in the upper triangular matrix. */
399 nzUpper = adiag[0] - adiag[n];
400 if (!upTriFactor) {
401 PetscScalar *AAUp;
402 PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
403
404 /* Allocate Space for the upper triangular matrix */
405 PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
406 PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
407
408 /* Fill the upper triangular matrix */
409 AiUp[0] = (PetscInt)0;
410 AiUp[n] = nzUpper;
411 offset = nzUpper;
412 for (i = n - 1; i >= 0; i--) {
413 v = aa + adiag[i + 1] + 1;
414 vi = aj + adiag[i + 1] + 1;
415 nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
416 offset -= (nz + 1); /* decrement the offset */
417
418 /* first, set the diagonal elements */
419 AjUp[offset] = (PetscInt)i;
420 AAUp[offset] = (MatScalar)1. / v[nz];
421 AiUp[i] = AiUp[i + 1] - (nz + 1);
422
423 PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
424 PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
425 }
426
427 /* allocate space for the triangular factor information */
428 PetscCall(PetscNew(&upTriFactor));
429 upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
430
431 /* Create the matrix description */
432 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
433 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
434 PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
435 PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
436 PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
437
438 /* set the operation */
439 upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
440
441 /* set the matrix */
442 upTriFactor->csrMat = new CsrMatrix;
443 upTriFactor->csrMat->num_rows = n;
444 upTriFactor->csrMat->num_cols = n;
445 upTriFactor->csrMat->num_entries = nzUpper;
446 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
447 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
448 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
449 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
450 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
451 upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
452
453 /* Create the solve analysis information */
454 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
455 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
456 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
457 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
458 PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
459
460 /* perform the solve analysis */
461 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
462 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
463
464 PetscCallHIP(WaitForHIP());
465 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
466
467 /* assign the pointer */
468 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
469 upTriFactor->AA_h = AAUp;
470 PetscCallHIP(hipHostFree(AiUp));
471 PetscCallHIP(hipHostFree(AjUp));
472 PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
473 } else {
474 if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
475 /* Fill the upper triangular matrix */
476 offset = nzUpper;
477 for (i = n - 1; i >= 0; i--) {
478 v = aa + adiag[i + 1] + 1;
479 nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
480 offset -= (nz + 1); /* decrement the offset */
481
482 /* first, set the diagonal elements */
483 upTriFactor->AA_h[offset] = 1. / v[nz];
484 PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
485 }
486 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
487 PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
488 }
489 } catch (char *ex) {
490 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
491 }
492 }
493 PetscFunctionReturn(PETSC_SUCCESS);
494 }
495
MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)496 static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
497 {
498 PetscBool row_identity, col_identity;
499 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
500 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
501 IS isrow = a->row, iscol = a->icol;
502 PetscInt n = A->rmap->n;
503
504 PetscFunctionBegin;
505 PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
506 PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
507 PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));
508
509 if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
510 hipsparseTriFactors->nnz = a->nz;
511
512 A->offloadmask = PETSC_OFFLOAD_BOTH;
513 /* lower triangular indices */
514 PetscCall(ISIdentity(isrow, &row_identity));
515 if (!row_identity && !hipsparseTriFactors->rpermIndices) {
516 const PetscInt *r;
517
518 PetscCall(ISGetIndices(isrow, &r));
519 hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
520 hipsparseTriFactors->rpermIndices->assign(r, r + n);
521 PetscCall(ISRestoreIndices(isrow, &r));
522 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
523 }
524 /* upper triangular indices */
525 PetscCall(ISIdentity(iscol, &col_identity));
526 if (!col_identity && !hipsparseTriFactors->cpermIndices) {
527 const PetscInt *c;
528
529 PetscCall(ISGetIndices(iscol, &c));
530 hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
531 hipsparseTriFactors->cpermIndices->assign(c, c + n);
532 PetscCall(ISRestoreIndices(iscol, &c));
533 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
534 }
535 PetscFunctionReturn(PETSC_SUCCESS);
536 }
537
MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)538 static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
539 {
540 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
541 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
542 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
543 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
544 PetscInt *AiUp, *AjUp;
545 PetscScalar *AAUp;
546 PetscScalar *AALo;
547 PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
548 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
549 const PetscInt *ai = b->i, *aj = b->j, *vj;
550 const MatScalar *aa = b->a, *v;
551
552 PetscFunctionBegin;
553 if (!n) PetscFunctionReturn(PETSC_SUCCESS);
554 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
555 try {
556 PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
557 PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
558 if (!upTriFactor && !loTriFactor) {
559 /* Allocate Space for the upper triangular matrix */
560 PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
561 PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
562
563 /* Fill the upper triangular matrix */
564 AiUp[0] = (PetscInt)0;
565 AiUp[n] = nzUpper;
566 offset = 0;
567 for (i = 0; i < n; i++) {
568 /* set the pointers */
569 v = aa + ai[i];
570 vj = aj + ai[i];
571 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
572
573 /* first, set the diagonal elements */
574 AjUp[offset] = (PetscInt)i;
575 AAUp[offset] = (MatScalar)1.0 / v[nz];
576 AiUp[i] = offset;
577 AALo[offset] = (MatScalar)1.0 / v[nz];
578
579 offset += 1;
580 if (nz > 0) {
581 PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
582 PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
583 for (j = offset; j < offset + nz; j++) {
584 AAUp[j] = -AAUp[j];
585 AALo[j] = AAUp[j] / v[nz];
586 }
587 offset += nz;
588 }
589 }
590
591 /* allocate space for the triangular factor information */
592 PetscCall(PetscNew(&upTriFactor));
593 upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
594
595 /* Create the matrix description */
596 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
597 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
598 PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
599 PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
600 PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
601
602 /* set the matrix */
603 upTriFactor->csrMat = new CsrMatrix;
604 upTriFactor->csrMat->num_rows = A->rmap->n;
605 upTriFactor->csrMat->num_cols = A->cmap->n;
606 upTriFactor->csrMat->num_entries = a->nz;
607 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
608 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
609 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
610 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
611 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
612 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
613
614 /* set the operation */
615 upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
616
617 /* Create the solve analysis information */
618 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
619 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
620 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
621 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
622 PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
623
624 /* perform the solve analysis */
625 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
626 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
627
628 PetscCallHIP(WaitForHIP());
629 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
630
631 /* assign the pointer */
632 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
633
634 /* allocate space for the triangular factor information */
635 PetscCall(PetscNew(&loTriFactor));
636 loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
637
638 /* Create the matrix description */
639 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
640 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
641 PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
642 PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
643 PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
644
645 /* set the operation */
646 loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;
647
648 /* set the matrix */
649 loTriFactor->csrMat = new CsrMatrix;
650 loTriFactor->csrMat->num_rows = A->rmap->n;
651 loTriFactor->csrMat->num_cols = A->cmap->n;
652 loTriFactor->csrMat->num_entries = a->nz;
653 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
654 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
655 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
656 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
657 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
658 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
659
660 /* Create the solve analysis information */
661 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
662 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
663 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
664 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
665 PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
666
667 /* perform the solve analysis */
668 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
669 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
670
671 PetscCallHIP(WaitForHIP());
672 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
673
674 /* assign the pointer */
675 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
676
677 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
678 PetscCallHIP(hipHostFree(AiUp));
679 PetscCallHIP(hipHostFree(AjUp));
680 } else {
681 /* Fill the upper triangular matrix */
682 offset = 0;
683 for (i = 0; i < n; i++) {
684 /* set the pointers */
685 v = aa + ai[i];
686 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
687
688 /* first, set the diagonal elements */
689 AAUp[offset] = 1.0 / v[nz];
690 AALo[offset] = 1.0 / v[nz];
691
692 offset += 1;
693 if (nz > 0) {
694 PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
695 for (j = offset; j < offset + nz; j++) {
696 AAUp[j] = -AAUp[j];
697 AALo[j] = AAUp[j] / v[nz];
698 }
699 offset += nz;
700 }
701 }
702 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
704 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
705 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
706 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
707 }
708 PetscCallHIP(hipHostFree(AAUp));
709 PetscCallHIP(hipHostFree(AALo));
710 } catch (char *ex) {
711 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
712 }
713 }
714 PetscFunctionReturn(PETSC_SUCCESS);
715 }
716
MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)717 static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
718 {
719 PetscBool perm_identity;
720 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
721 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
722 IS ip = a->row;
723 PetscInt n = A->rmap->n;
724
725 PetscFunctionBegin;
726 PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
727 PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
728 if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
729 hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
730
731 A->offloadmask = PETSC_OFFLOAD_BOTH;
732 /* lower triangular indices */
733 PetscCall(ISIdentity(ip, &perm_identity));
734 if (!perm_identity) {
735 IS iip;
736 const PetscInt *irip, *rip;
737
738 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
739 PetscCall(ISGetIndices(iip, &irip));
740 PetscCall(ISGetIndices(ip, &rip));
741 hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
742 hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
743 hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
744 hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
745 PetscCall(ISRestoreIndices(iip, &irip));
746 PetscCall(ISDestroy(&iip));
747 PetscCall(ISRestoreIndices(ip, &rip));
748 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
749 }
750 PetscFunctionReturn(PETSC_SUCCESS);
751 }
752
MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B,Mat A,const MatFactorInfo * info)753 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
754 {
755 PetscBool perm_identity;
756 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
757 IS ip = b->row;
758
759 PetscFunctionBegin;
760 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
761 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
762 B->offloadmask = PETSC_OFFLOAD_CPU;
763 /* determine which version of MatSolve needs to be used. */
764 PetscCall(ISIdentity(ip, &perm_identity));
765 if (perm_identity) {
766 B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
767 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
768 B->ops->matsolve = NULL;
769 B->ops->matsolvetranspose = NULL;
770 } else {
771 B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
772 B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
773 B->ops->matsolve = NULL;
774 B->ops->matsolvetranspose = NULL;
775 }
776
777 /* get the triangular factors */
778 PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
779 PetscFunctionReturn(PETSC_SUCCESS);
780 }
781
MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)782 static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
783 {
784 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
785 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
786 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
787 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
788 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
789 hipsparseIndexBase_t indexBase;
790 hipsparseMatrixType_t matrixType;
791 hipsparseFillMode_t fillMode;
792 hipsparseDiagType_t diagType;
793
794 PetscFunctionBegin;
795 /* allocate space for the transpose of the lower triangular factor */
796 PetscCall(PetscNew(&loTriFactorT));
797 loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
798
799 /* set the matrix descriptors of the lower triangular factor */
800 matrixType = hipsparseGetMatType(loTriFactor->descr);
801 indexBase = hipsparseGetMatIndexBase(loTriFactor->descr);
802 fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
803 diagType = hipsparseGetMatDiagType(loTriFactor->descr);
804
805 /* Create the matrix description */
806 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
807 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
808 PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
809 PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
810 PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
811
812 /* set the operation */
813 loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
814
815 /* allocate GPU space for the CSC of the lower triangular factor*/
816 loTriFactorT->csrMat = new CsrMatrix;
817 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
818 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
819 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
820 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
821 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
822 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
823
824 /* compute the transpose of the lower triangular factor, i.e. the CSC */
825 /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
826 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
827 PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
828 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
829 loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
830 PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
831 #endif
832 */
833 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
834
835 PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
836 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
837 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
838 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
839 hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
840 #else
841 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
842 #endif
843
844 PetscCallHIP(WaitForHIP());
845 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
846
847 /* Create the solve analysis information */
848 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
849 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
850 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
851 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
852 PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
853
854 /* perform the solve analysis */
855 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
856 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
857
858 PetscCallHIP(WaitForHIP());
859 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
860
861 /* assign the pointer */
862 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
863
864 /*********************************************/
865 /* Now the Transpose of the Upper Tri Factor */
866 /*********************************************/
867
868 /* allocate space for the transpose of the upper triangular factor */
869 PetscCall(PetscNew(&upTriFactorT));
870 upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
871
872 /* set the matrix descriptors of the upper triangular factor */
873 matrixType = hipsparseGetMatType(upTriFactor->descr);
874 indexBase = hipsparseGetMatIndexBase(upTriFactor->descr);
875 fillMode = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
876 diagType = hipsparseGetMatDiagType(upTriFactor->descr);
877
878 /* Create the matrix description */
879 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
880 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
881 PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
882 PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
883 PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
884
885 /* set the operation */
886 upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
887
888 /* allocate GPU space for the CSC of the upper triangular factor*/
889 upTriFactorT->csrMat = new CsrMatrix;
890 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
891 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
892 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
893 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
894 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
895 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
896
897 /* compute the transpose of the upper triangular factor, i.e. the CSC */
898 /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
899 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
900 PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
901 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
902 upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
903 PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
904 #endif
905 */
906 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
907 PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
908 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
909 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
910 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
911 hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
912 #else
913 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
914 #endif
915
916 PetscCallHIP(WaitForHIP());
917 PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
918
919 /* Create the solve analysis information */
920 PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
921 PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
922 PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
923 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
924 PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
925
926 /* perform the solve analysis */
927 PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
928 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
929
930 PetscCallHIP(WaitForHIP());
931 PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
932
933 /* assign the pointer */
934 ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
935 PetscFunctionReturn(PETSC_SUCCESS);
936 }
937
/* Thrust functor converting a PetscScalar to a PetscInt by truncating its real part.
   Used below to recover the csr2csc index permutation that was computed by running
   csr2csc on a sequence of scalar values 0,1,2,... */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};
941
/* Form (or refresh) the explicit transpose of A on the GPU and cache it in
   hipsparsestruct->matTranspose.  For the CSR format, the first call computes an
   index permutation (csr2csc_i) via a scalar csr2csc on the sequence 0,1,2,...;
   later numeric updates then only permute the values, avoiding a full csr2csc. */
static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
  hipsparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* nothing to do if the cached transpose is already current */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats rebuild the transpose from scratch each time */
  if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
    matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
    PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
    indexBase = hipsparseGetMatIndexBase(matstruct->descr);
    PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
      /* allocate the CSR storage of A^T; values are filled in the update step below */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

      PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                                            indexBase, hipsparse_scalartype));
    } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                                           tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

      /* Last, convert CSC to HYB */
      hipsparseHybMat_t hybMat;
      PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
      hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
      PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
    }
  }
  if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!hipsparsestruct->csr2csc_i) {
      /* run csr2csc on the sequence 0,1,2,... (as scalars) so that the transposed
         "values" are exactly the permutation indices from CSR order to CSC order */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = hipsparseGetMatIndexBase(matstruct->descr);
      if (matrix->num_entries) {
        /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
           Need to verify this for ROCm.
        */
        PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
      } else {
        /* empty matrix: row offsets are all equal to the index base */
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* truncate the scalar permutation back to integer indices for later reuse */
      hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
    }
    /* numeric update: gather the CSR values through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1083
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
/* Transpose solve with a (non-identity) factorization ordering: permute b by the row
   permutation, solve U^T then L^T (stored explicitly as transposed factors, so both
   solves are NON_TRANSPOSE), then apply the column permutation to the result. */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1136
/* Transpose solve in natural ordering: same as MatSolveTranspose_SeqAIJHIPSPARSE but
   with no permutation steps — solve U^T directly from b, then L^T into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                  *barray;
  PetscScalar                        *xarray;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1174
/* Solve A x = b for an LU-factored matrix with row/column permutations: gather b through
   the row-permutation indices into the work vector, solve L then U, and finally scatter
   the result through the column-permutation indices into x. All work is done on the GPU
   on the default stream. */
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]]. Both permutation
     iterators share the base bGPU; the index iterator (rpermIndices) defines the range. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: L y = P_r b, with y written into xarray (used here as scratch) */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                           loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: U z = y, with z back in the work vector */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                           upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* one multiply and one add per stored nonzero of L and U, minus n */
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1214
MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)1215 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1216 {
1217 const PetscScalar *barray;
1218 PetscScalar *xarray;
1219 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1220 Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1221 Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1222 THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1223
1224 PetscFunctionBegin;
1225 /* Get the GPU pointers */
1226 PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1227 PetscCall(VecHIPGetArrayRead(bb, &barray));
1228
1229 PetscCall(PetscLogGpuTimeBegin());
1230 /* First, solve L */
1231 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1232 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1233
1234 /* Next, solve U */
1235 PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1236 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1237
1238 PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1239 PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1240 PetscCall(PetscLogGpuTimeEnd());
1241 PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1242 PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244
1245 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1246 /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
/* Solve A x = b using the ILU(0) factors produced by MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0:
   first L y = b (y kept in the device work array fs->Y), then U x = y, both via
   hipsparseSpSV_solve(). The dense-vector descriptors are rebound to the current arrays
   before each solve. */
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar *barray;
  PetscScalar *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* These hipSPARSE versions take no explicit buffer argument in hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
#endif
  /* Solve U*x = y; descriptor X is rebound to the output array first */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
#endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* one multiply and one add per stored nonzero, minus n */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1285
/* Solve A^T x = b using the ILU(0) factors: U^T y = b, then L^T x = y. The SpSV descriptors
   and buffers for the transposed solves are created lazily on the first call, and their
   numeric analysis is redone whenever the factor values change (tracked via
   fs->updatedTransposeSpSVAnalysis, reset by the numeric factorization). */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar *barray;
  PetscScalar *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                                fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Numeric analysis of the transpose solves, redone whenever the factor values changed */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b, with y kept in the device work array fs->Y */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* These hipSPARSE versions take no explicit buffer argument in hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
#endif
  /* Solve Lt*x = y; descriptor X is rebound to the output array first */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
#endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* one multiply and one add per stored nonzero, minus n */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1342
/* Numeric ILU(0) factorization on the GPU: copy A's values into fact's preallocated device
   CSR arrays (the pattern was fixed by the symbolic phase), factorize in place with
   hipsparseXcsrilu02(), then run the numeric SpSV analyses for the L and U triangular
   solves. Also installs the SpSV-based solve routines on fact. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix *Acsr;
  PetscInt m, nz;
  PetscBool flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* ensure A's device copy is current */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
                                          fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Check for a numerically zero pivot encountered during the factorization */
    int numerical_zero;
    hipsparseStatus_t status;
    status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ILU0;
  fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
  fact->ops->matsolve = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flops were estimated in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1392
MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)1393 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1394 {
1395 Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1396 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1397 PetscInt m, nz;
1398
1399 PetscFunctionBegin;
1400 if (PetscDefined(USE_DEBUG)) {
1401 PetscBool flg, diagDense;
1402
1403 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1404 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1405 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1406 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1407 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1408 }
1409
1410 /* Free the old stale stuff */
1411 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1412
1413 /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1414 but they will not be used. Allocate them just for easy debugging.
1415 */
1416 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1417
1418 fact->offloadmask = PETSC_OFFLOAD_BOTH;
1419 fact->factortype = MAT_FACTOR_ILU;
1420 fact->info.factor_mallocs = 0;
1421 fact->info.fill_ratio_given = info->fill;
1422 fact->info.fill_ratio_needed = 1.0;
1423
1424 aij->row = NULL;
1425 aij->col = NULL;
1426
1427 /* ====================================================================== */
1428 /* Copy A's i, j to fact and also allocate the value array of fact. */
1429 /* We'll do in-place factorization on fact */
1430 /* ====================================================================== */
1431 const int *Ai, *Aj;
1432
1433 m = fact->rmap->n;
1434 nz = aij->nz;
1435
1436 PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1437 PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1438 PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1439 PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1440 PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1441 PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1442
1443 /* ====================================================================== */
1444 /* Create descriptors for M, L, U */
1445 /* ====================================================================== */
1446 hipsparseFillMode_t fillMode;
1447 hipsparseDiagType_t diagType;
1448
1449 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1450 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1451 PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1452
1453 /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1454 hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1455 assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1456 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1457 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1458 */
1459 fillMode = HIPSPARSE_FILL_MODE_LOWER;
1460 diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1461 PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1462 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1463 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1464
1465 fillMode = HIPSPARSE_FILL_MODE_UPPER;
1466 diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1467 PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1468 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1469 PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1470
1471 /* ========================================================================= */
1472 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1473 /* ========================================================================= */
1474 PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1475 if (m)
1476 PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1477 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1478
1479 PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1480 PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1481
1482 PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1483 PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1484
1485 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1486 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1487
1488 PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1489 PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1490
1491 /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1492 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1493 */
1494 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1495 PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1496 fs->spsvBuffer_L = fs->factBuffer_M;
1497 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1498 } else {
1499 PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1500 fs->spsvBuffer_U = fs->factBuffer_M;
1501 PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1502 }
1503
1504 /* ========================================================================== */
1505 /* Perform analysis of ilu0 on M, SpSv on L and U */
1506 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1507 /* ========================================================================== */
1508 int structural_zero;
1509
1510 fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1511 if (m)
1512 PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1513 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1514 if (PetscDefined(USE_DEBUG)) {
1515 /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1516 hipsparseStatus_t status;
1517 status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1518 PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1519 }
1520
1521 /* Estimate FLOPs of the numeric factorization */
1522 {
1523 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1524 PetscInt *Ai, nzRow, nzLeft;
1525 PetscLogDouble flops = 0.0;
1526 const PetscInt *Adiag;
1527
1528 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &Adiag, NULL));
1529 Ai = Aseq->i;
1530 for (PetscInt i = 0; i < m; i++) {
1531 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1532 nzRow = Ai[i + 1] - Ai[i];
1533 nzLeft = Adiag[i] - Ai[i];
1534 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1535 and include the eliminated one will be updated, which incurs a multiplication and an addition.
1536 */
1537 nzLeft = (nzRow - 1) / 2;
1538 flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1539 }
1540 }
1541 fs->numericFactFlops = flops;
1542 }
1543 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1544 PetscFunctionReturn(PETSC_SUCCESS);
1545 }
1546
/* Solve A x = b using the ICC(0) factor L (A factored as L and its transpose): first
   L y = b (y kept in the device work array fs->Y), then L^T x = y, both via
   hipsparseSpSV_solve(). Also used as the transpose solve since the two coincide. */
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar *barray;
  PetscScalar *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b; rebind the dense-vector descriptors to the current arrays */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* These hipSPARSE versions take no explicit buffer argument in hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
#endif
  /* Solve Lt*x = y; descriptor X is rebound to the output array first */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
#endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* one multiply and one add per stored nonzero, minus n */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585
/* Numeric ICC(0) factorization on the GPU: copy A's values into fact's preallocated device
   CSR arrays, factorize in place with hipsparseXcsric02() (which reads only the lower
   triangular part), then run the numeric SpSV analyses for the L and L^T solves. Installs
   MatSolve_SeqAIJHIPSPARSE_ICC0 for both solve and transpose solve, which coincide for
   this symmetric factorization. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix *Acsr;
  PetscInt m, nz;
  PetscBool flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* ensure A's device copy is current */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* Factorize fact inplace */
  /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Check for a numerically zero pivot encountered during the factorization */
    int numerical_zero;
    hipsparseStatus_t status;
    status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* hipsparseSpSV_analysis() is numeric, so it must run after the values are factored */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
   ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
   */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask = PETSC_OFFLOAD_GPU;
  fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ICC0;
  fact->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_ICC0; /* same routine: the factorization is symmetric */
  fact->ops->matsolve = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flops were estimated in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1637
/* Symbolic phase of the device ICC(0) (zero-fill incomplete Cholesky) factorization via hipSPARSE csric02.

   Since ICC(0) produces a factor with exactly A's sparsity pattern, no symbolic fill
   computation is needed. This routine instead:
     - copies A's CSR structure (i, j) to the factor and allocates the factor's value array,
     - creates the hipSPARSE descriptors for the factorization (matDescr_M) and for the
       triangular solves with L and L^T (spMatDescr_L, used with both op = N and op = T),
     - queries and allocates the csric02/SpSV work buffers (sharing one buffer to save memory),
     - runs the csric02 analysis once and estimates the numeric-phase flop count.
   The numeric factorization is installed as MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0 at the end.
   `perm` is expected to be the identity (callers only reach this path in that case). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    /* csric02 requires a square MATSEQAIJHIPSPARSE matrix with all diagonal entries present */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask = PETSC_OFFLOAD_BOTH;
  fact->factortype = MAT_FACTOR_ICC;
  fact->info.factor_mallocs = 0;
  fact->info.fill_ratio_given = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* zero fill: factor has exactly A's nonzeros */

  aij->row = NULL; /* identity ordering: no row/column permutations stored */
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m = fact->rmap->n;
  nz = aij->nz;

  PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
  PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  hipsparseFillMode_t fillMode;
  hipsparseDiagType_t diagType;

  PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
  PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
  PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = HIPSPARSE_FILL_MODE_LOWER;     /* L is the lower triangle of the full-pattern factor storage */
  diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;  /* csric02 stores the actual (non-unit) diagonal of L */
  PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* work vectors wrapped as dense-vector descriptors for the SpSV calls */
  PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

  PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
     The factorization buffer is aliased with the LARGER of the two SpSV buffers, and the
     smaller one gets its own allocation.
  */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int structural_zero;

  fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    hipsparseStatus_t status;
    /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
    status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization, logged later by the numeric phase */
  {
    Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* assumes roughly half of the row's off-diagonal nonzeros lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1778 #endif
1779
MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)1780 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1781 {
1782 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1783
1784 PetscFunctionBegin;
1785 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1786 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1787 if (!info->factoronhost) {
1788 PetscCall(ISIdentity(isrow, &row_identity));
1789 PetscCall(ISIdentity(iscol, &col_identity));
1790 }
1791 if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1792 else
1793 #endif
1794 {
1795 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1796 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1797 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1798 }
1799 PetscFunctionReturn(PETSC_SUCCESS);
1800 }
1801
MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)1802 static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1803 {
1804 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1805
1806 PetscFunctionBegin;
1807 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1808 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1809 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1810 PetscFunctionReturn(PETSC_SUCCESS);
1811 }
1812
MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo * info)1813 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1814 {
1815 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1816
1817 PetscFunctionBegin;
1818 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1819 PetscBool perm_identity = PETSC_FALSE;
1820 if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1821 if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1822 else
1823 #endif
1824 {
1825 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1826 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1827 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1828 }
1829 PetscFunctionReturn(PETSC_SUCCESS);
1830 }
1831
MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo * info)1832 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1833 {
1834 Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1835
1836 PetscFunctionBegin;
1837 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1838 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1839 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1840 PetscFunctionReturn(PETSC_SUCCESS);
1841 }
1842
/* Report the solver package ("hipsparse") that produced this factor matrix.
   Composed on factor matrices under the name "MatFactorGetSolverType_C";
   the Mat argument is not inspected. */
static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERHIPSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1849
1850 /*MC
1851 MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
1852 on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1855 HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1856 algorithms are not recommended. This class does NOT support direct solver operations.
1857
1858 Level: beginner
1859
1860 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1861 M*/
1862
MatGetFactor_seqaijhipsparse_hipsparse(Mat A,MatFactorType ftype,Mat * B)1863 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1864 {
1865 PetscInt n = A->rmap->n;
1866
1867 PetscFunctionBegin;
1868 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1869 PetscCall(MatSetSizes(*B, n, n, n, n));
1870 (*B)->factortype = ftype;
1871 PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));
1872
1873 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1874 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1875 PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1876 if (!A->boundtocpu) {
1877 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1878 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1879 } else {
1880 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1881 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1882 }
1883 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1884 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1885 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1886 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1887 if (!A->boundtocpu) {
1888 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1889 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1890 } else {
1891 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1892 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1893 }
1894 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1895 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1896 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");
1897
1898 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1899 (*B)->canuseordering = PETSC_TRUE;
1900 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1901 PetscFunctionReturn(PETSC_SUCCESS);
1902 }
1903
/* Copy the matrix values from the GPU back into the host CSR value array.

   Only values are copied; the host already holds the current sparsity pattern.
   Unfactored matrices read from the CsrMatrix inside the Mat_SeqAIJHIPSPARSE
   mult structure; factored matrices (HIP >= 4.5) read from the triangular-factor
   value array when it exists. On success the offload mask becomes OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
  /* Same spptr reinterpreted; only consulted in the factored-matrix branch below */
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1933
MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A,PetscScalar * array[])1934 static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1935 {
1936 PetscFunctionBegin;
1937 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1938 *array = ((Mat_SeqAIJ *)A->data)->a;
1939 PetscFunctionReturn(PETSC_SUCCESS);
1940 }
1941
MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A,PetscScalar * array[])1942 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1943 {
1944 PetscFunctionBegin;
1945 A->offloadmask = PETSC_OFFLOAD_CPU;
1946 *array = NULL;
1947 PetscFunctionReturn(PETSC_SUCCESS);
1948 }
1949
MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A,const PetscScalar * array[])1950 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1951 {
1952 PetscFunctionBegin;
1953 PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1954 *array = ((Mat_SeqAIJ *)A->data)->a;
1955 PetscFunctionReturn(PETSC_SUCCESS);
1956 }
1957
/* End read-only access to the host value array. Values were not modified, so
   the offload mask is left untouched; only the caller's pointer is cleared. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1964
MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A,PetscScalar * array[])1965 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1966 {
1967 PetscFunctionBegin;
1968 *array = ((Mat_SeqAIJ *)A->data)->a;
1969 PetscFunctionReturn(PETSC_SUCCESS);
1970 }
1971
MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A,PetscScalar * array[])1972 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1973 {
1974 PetscFunctionBegin;
1975 A->offloadmask = PETSC_OFFLOAD_CPU;
1976 *array = NULL;
1977 PetscFunctionReturn(PETSC_SUCCESS);
1978 }
1979
MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A,const PetscInt ** i,const PetscInt ** j,PetscScalar ** a,PetscMemType * mtype)1980 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1981 {
1982 Mat_SeqAIJHIPSPARSE *cusp;
1983 CsrMatrix *matrix;
1984
1985 PetscFunctionBegin;
1986 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1987 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1988 cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1989 PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1990 matrix = (CsrMatrix *)cusp->mat->mat;
1991
1992 if (i) {
1993 #if !defined(PETSC_USE_64BIT_INDICES)
1994 *i = matrix->row_offsets->data().get();
1995 #else
1996 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
1997 #endif
1998 }
1999 if (j) {
2000 #if !defined(PETSC_USE_64BIT_INDICES)
2001 *j = matrix->column_indices->data().get();
2002 #else
2003 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2004 #endif
2005 }
2006 if (a) *a = matrix->values->data().get();
2007 if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2008 PetscFunctionReturn(PETSC_SUCCESS);
2009 }
2010
/* Copy the host CSR matrix to the GPU, (re)building the device representation as needed.

   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the storage
   format is CSR, only the value array is re-uploaded and any cached transpose values
   are invalidated. Otherwise the whole device structure is destroyed and rebuilt from
   the host arrays, honoring compressed-row storage and converting to ELL/HYB format
   when that storage format was requested. */
PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct = hipsparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscBool both = PETSC_TRUE; /* set PETSC_FALSE if the host has no value array, so device cannot mirror host */
  PetscInt m = A->rmap->n, *ii, *ridx, tmp;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallHIP(WaitForHIP());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      /* Values changed, so cached transpose values are stale; its structure is still valid (PETSC_FALSE) */
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      /* Pattern or format changed: destroy and rebuild the full device representation */
      PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete hipsparsestruct->workVector;
      delete hipsparsestruct->rowoffsets_gpu;
      hipsparsestruct->workVector = NULL;
      hipsparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* Only rows with nonzeros are stored; ridx maps compressed row index -> actual row */
          m = a->compressedrow.nrows;
          ii = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m = A->rmap->n;
          ii = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* Structure-only matrix on host: take nnz from the row offsets */
          nnz = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create hipsparse matrix */
        hipsparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJHIPSPARSEMultStruct;
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (1, 0, 1), used with HIPSPARSE_POINTER_MODE_DEVICE below */
        PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
          if (mat->num_rows) { /* hipsparse errors on empty matrices! */
            PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                                  HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
          }
        } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
          /* Stage a temporary CSR copy on device, convert it to HYB/ELL, then free the staging copy */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          hipsparseHybMat_t hybMat;
          PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
          hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
          PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          hipsparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m; /* number of compressed-row indices shipped to the device, for logging below */
        } else {
          hipsparsestruct->workVector = NULL;
          matstruct->cprowIndices = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        hipsparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
      }
      PetscCallHIP(WaitForHIP());
      PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      hipsparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2147
2148 struct VecHIPPlusEquals {
2149 template <typename Tuple>
operator ()VecHIPPlusEquals2150 __host__ __device__ void operator()(Tuple t)
2151 {
2152 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2153 }
2154 };
2155
/* Thrust functor for zipped iterators: copy the first tuple element into the
   second, i.e. y = x elementwise. */
struct VecHIPEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2163
/* Thrust functor for zipped iterators: copy in the opposite direction of
   VecHIPEquals, i.e. x = y elementwise. */
struct VecHIPEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2171
/* Product context stored in C->product->data for hipSPARSE sparse-times-dense
   and sparse-times-sparse matrix products. Freed by MatProductCtxDestroy_MatMatHipsparse(). */
struct MatProductCtx_MatMatHipsparse {
  PetscBool cisdense;                /* is the product matrix C dense? */
  PetscScalar *Bt;                   /* device buffer for an explicit transpose of B, when needed */
  Mat X;                             /* intermediate product matrix (used by PtAP/RARt) */
  PetscBool reusesym;                /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;              /* flop count of the product, for logging */
  CsrMatrix *Bcsr;                   /* device CSR copy of B, when needed */
  hipsparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B */
  PetscBool initialized;             /* are the descriptors/buffers below ready for C = alpha op(A) op(B) + beta C? */
  hipsparseDnMatDescr_t matBDescr;   /* dense descriptor for B */
  hipsparseDnMatDescr_t matCDescr;   /* dense descriptor for C (or X) */
  PetscInt Blda, Clda;               /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  void *dBuffer4, *dBuffer5; /* extra SpGEMM work buffers required by newer hipSPARSE */
#endif
  size_t mmBufferSize;        /* allocated size of mmBuffer */
  void *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  hipsparseSpGEMMDescr_t spgemmDesc;
};
2191
MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)2192 static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
2193 {
2194 MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;
2195
2196 PetscFunctionBegin;
2197 PetscCallHIP(hipFree(mmdata->Bt));
2198 delete mmdata->Bcsr;
2199 if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2200 if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2201 if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2202 if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2203 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2204 if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2205 if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2206 #endif
2207 if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2208 if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2209 PetscCall(MatDestroy(&mmdata->X));
2210 PetscCall(PetscFree(*(void **)data));
2211 PetscFunctionReturn(PETSC_SUCCESS);
2212 }
2213
MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)2214 static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2215 {
2216 Mat_Product *product = C->product;
2217 Mat A, B;
2218 PetscInt m, n, blda, clda;
2219 PetscBool flg, biship;
2220 Mat_SeqAIJHIPSPARSE *cusp;
2221 hipsparseOperation_t opA;
2222 const PetscScalar *barray;
2223 PetscScalar *carray;
2224 MatProductCtx_MatMatHipsparse *mmdata;
2225 Mat_SeqAIJHIPSPARSEMultStruct *mat;
2226 CsrMatrix *csrmat;
2227
2228 PetscFunctionBegin;
2229 MatCheckProduct(C, 1);
2230 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2231 mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
2232 A = product->A;
2233 B = product->B;
2234 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2235 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2236 /* currently CopyToGpu does not copy if the matrix is bound to CPU
2237 Instead of silently accepting the wrong answer, I prefer to raise the error */
2238 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2239 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2240 cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2241 switch (product->type) {
2242 case MATPRODUCT_AB:
2243 case MATPRODUCT_PtAP:
2244 mat = cusp->mat;
2245 opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2246 m = A->rmap->n;
2247 n = B->cmap->n;
2248 break;
2249 case MATPRODUCT_AtB:
2250 if (!A->form_explicit_transpose) {
2251 mat = cusp->mat;
2252 opA = HIPSPARSE_OPERATION_TRANSPOSE;
2253 } else {
2254 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2255 mat = cusp->matTranspose;
2256 opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2257 }
2258 m = A->cmap->n;
2259 n = B->cmap->n;
2260 break;
2261 case MATPRODUCT_ABt:
2262 case MATPRODUCT_RARt:
2263 mat = cusp->mat;
2264 opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2265 m = A->rmap->n;
2266 n = B->rmap->n;
2267 break;
2268 default:
2269 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2270 }
2271 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2272 csrmat = (CsrMatrix *)mat->mat;
2273 /* if the user passed a CPU matrix, copy the data to the GPU */
2274 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2275 if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
2276 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2277 PetscCall(MatDenseGetLDA(B, &blda));
2278 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2279 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2280 PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2281 } else {
2282 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2283 PetscCall(MatDenseGetLDA(C, &clda));
2284 }
2285
2286 PetscCall(PetscLogGpuTimeBegin());
2287 hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2288 /* (re)allocate mmBuffer if not initialized or LDAs are different */
2289 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2290 size_t mmBufferSize;
2291 if (mmdata->initialized && mmdata->Blda != blda) {
2292 PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2293 mmdata->matBDescr = NULL;
2294 }
2295 if (!mmdata->matBDescr) {
2296 PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2297 mmdata->Blda = blda;
2298 }
2299 if (mmdata->initialized && mmdata->Clda != clda) {
2300 PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2301 mmdata->matCDescr = NULL;
2302 }
2303 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2304 PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2305 mmdata->Clda = clda;
2306 }
2307 if (!mat->matDescr) {
2308 PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2309 HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2310 }
2311 PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2312 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2313 PetscCallHIP(hipFree(mmdata->mmBuffer));
2314 PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2315 mmdata->mmBufferSize = mmBufferSize;
2316 }
2317 mmdata->initialized = PETSC_TRUE;
2318 } else {
2319 /* to be safe, always update pointers of the mats */
2320 PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2321 PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2322 PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2323 }
2324
2325 /* do hipsparseSpMM, which supports transpose on B */
2326 PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2327
2328 PetscCall(PetscLogGpuTimeEnd());
2329 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2330 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2331 if (product->type == MATPRODUCT_RARt) {
2332 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2333 PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2334 } else if (product->type == MATPRODUCT_PtAP) {
2335 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2336 PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2337 } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2338 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2339 if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2340 PetscFunctionReturn(PETSC_SUCCESS);
2341 }
2342
MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)2343 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2344 {
2345 Mat_Product *product = C->product;
2346 Mat A, B;
2347 PetscInt m, n;
2348 PetscBool cisdense, flg;
2349 MatProductCtx_MatMatHipsparse *mmdata;
2350 Mat_SeqAIJHIPSPARSE *cusp;
2351
2352 PetscFunctionBegin;
2353 MatCheckProduct(C, 1);
2354 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2355 A = product->A;
2356 B = product->B;
2357 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2358 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2359 cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2360 PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2361 switch (product->type) {
2362 case MATPRODUCT_AB:
2363 m = A->rmap->n;
2364 n = B->cmap->n;
2365 break;
2366 case MATPRODUCT_AtB:
2367 m = A->cmap->n;
2368 n = B->cmap->n;
2369 break;
2370 case MATPRODUCT_ABt:
2371 m = A->rmap->n;
2372 n = B->rmap->n;
2373 break;
2374 case MATPRODUCT_PtAP:
2375 m = B->cmap->n;
2376 n = B->cmap->n;
2377 break;
2378 case MATPRODUCT_RARt:
2379 m = B->rmap->n;
2380 n = B->rmap->n;
2381 break;
2382 default:
2383 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2384 }
2385 PetscCall(MatSetSizes(C, m, n, m, n));
2386 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2387 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2388 PetscCall(MatSetType(C, MATSEQDENSEHIP));
2389
2390 /* product data */
2391 PetscCall(PetscNew(&mmdata));
2392 mmdata->cisdense = cisdense;
2393 /* for these products we need intermediate storage */
2394 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2395 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2396 PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2397 /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2398 if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2399 else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2400 }
2401 C->product->data = mmdata;
2402 C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2403 C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2404 PetscFunctionReturn(PETSC_SUCCESS);
2405 }
2406
/* Numeric phase for C = op(A)*op(B) with A, B, C all MATSEQAIJHIPSPARSE.
   The sparsity pattern of C and the hipSPARSE SpGEMM descriptors/buffers were
   built in the symbolic phase; here only the numerical values are recomputed
   on the GPU. Finishes with a shortened MatAssemblyEnd_SeqAIJ. */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscBool                      flg;
  MatProductType                 ptype;
  MatProductCtx_MatMatHipsparse *mmdata;
  hipsparseSpMatDescr_t          BmatSpDescr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values already on the GPU; only the assembly bookkeeping below is needed */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

  /* If symbolic exploited symmetry of A (or B), the numeric phase must use the
     same plain-AB formulation; otherwise the cached descriptors would not match */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized via the explicitly stored transpose structs, since
     hipSPARSE SpGEMM only supports non-transposed operands (see opA/opB above) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  /* SpGEMMreuse path: the descriptor cached at symbolic time already holds the structure */
  PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
  #else
  /* older SpGEMM API: recompute into the work buffer, then copy values into C */
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
  PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
  #endif
#else
  /* legacy csrgemm interface (pre-HIP 5.0) */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallHIP(WaitForHIP());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2521
MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)2522 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2523 {
2524 Mat_Product *product = C->product;
2525 Mat A, B;
2526 Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
2527 Mat_SeqAIJ *a, *b, *c;
2528 Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2529 CsrMatrix *Acsr, *Bcsr, *Ccsr;
2530 PetscInt i, j, m, n, k;
2531 PetscBool flg;
2532 MatProductType ptype;
2533 MatProductCtx_MatMatHipsparse *mmdata;
2534 PetscLogDouble flops;
2535 PetscBool biscompressed, ciscompressed;
2536 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2537 int64_t C_num_rows1, C_num_cols1, C_nnz1;
2538 hipsparseSpMatDescr_t BmatSpDescr;
2539 #else
2540 int cnz;
2541 #endif
2542 hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2543
2544 PetscFunctionBegin;
2545 MatCheckProduct(C, 1);
2546 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2547 A = product->A;
2548 B = product->B;
2549 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2550 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2551 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2552 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2553 a = (Mat_SeqAIJ *)A->data;
2554 b = (Mat_SeqAIJ *)B->data;
2555 /* product data */
2556 PetscCall(PetscNew(&mmdata));
2557 C->product->data = mmdata;
2558 C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2559
2560 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2561 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2562 Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2563 Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2564 PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2565 PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2566
2567 ptype = product->type;
2568 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2569 ptype = MATPRODUCT_AB;
2570 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2571 }
2572 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2573 ptype = MATPRODUCT_AB;
2574 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2575 }
2576 biscompressed = PETSC_FALSE;
2577 ciscompressed = PETSC_FALSE;
2578 switch (ptype) {
2579 case MATPRODUCT_AB:
2580 m = A->rmap->n;
2581 n = B->cmap->n;
2582 k = A->cmap->n;
2583 Amat = Acusp->mat;
2584 Bmat = Bcusp->mat;
2585 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2586 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2587 break;
2588 case MATPRODUCT_AtB:
2589 m = A->cmap->n;
2590 n = B->cmap->n;
2591 k = A->rmap->n;
2592 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2593 Amat = Acusp->matTranspose;
2594 Bmat = Bcusp->mat;
2595 if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2596 break;
2597 case MATPRODUCT_ABt:
2598 m = A->rmap->n;
2599 n = B->rmap->n;
2600 k = A->cmap->n;
2601 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2602 Amat = Acusp->mat;
2603 Bmat = Bcusp->matTranspose;
2604 if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2605 break;
2606 default:
2607 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2608 }
2609
2610 /* create hipsparse matrix */
2611 PetscCall(MatSetSizes(C, m, n, m, n));
2612 PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2613 c = (Mat_SeqAIJ *)C->data;
2614 Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2615 Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
2616 Ccsr = new CsrMatrix;
2617
2618 c->compressedrow.use = ciscompressed;
2619 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2620 c->compressedrow.nrows = a->compressedrow.nrows;
2621 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2622 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2623 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2624 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2625 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2626 } else {
2627 c->compressedrow.nrows = 0;
2628 c->compressedrow.i = NULL;
2629 c->compressedrow.rindex = NULL;
2630 Ccusp->workVector = NULL;
2631 Cmat->cprowIndices = NULL;
2632 }
2633 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2634 Ccusp->mat = Cmat;
2635 Ccusp->mat->mat = Ccsr;
2636 Ccsr->num_rows = Ccusp->nrows;
2637 Ccsr->num_cols = n;
2638 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2639 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2640 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2641 PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2642 PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2643 PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2644 PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2645 PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2646 PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2647 PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2648 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2649 thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2650 c->nz = 0;
2651 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2652 Ccsr->values = new THRUSTARRAY(c->nz);
2653 goto finalizesym;
2654 }
2655
2656 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2657 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2658 Acsr = (CsrMatrix *)Amat->mat;
2659 if (!biscompressed) {
2660 Bcsr = (CsrMatrix *)Bmat->mat;
2661 BmatSpDescr = Bmat->matDescr;
2662 } else { /* we need to use row offsets for the full matrix */
2663 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2664 Bcsr = new CsrMatrix;
2665 Bcsr->num_rows = B->rmap->n;
2666 Bcsr->num_cols = cBcsr->num_cols;
2667 Bcsr->num_entries = cBcsr->num_entries;
2668 Bcsr->column_indices = cBcsr->column_indices;
2669 Bcsr->values = cBcsr->values;
2670 if (!Bcusp->rowoffsets_gpu) {
2671 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2672 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2673 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2674 }
2675 Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2676 mmdata->Bcsr = Bcsr;
2677 if (Bcsr->num_rows && Bcsr->num_cols) {
2678 PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2679 }
2680 BmatSpDescr = mmdata->matSpBDescr;
2681 }
2682 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2683 PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2684 /* precompute flops count */
2685 if (ptype == MATPRODUCT_AB) {
2686 for (i = 0, flops = 0; i < A->rmap->n; i++) {
2687 const PetscInt st = a->i[i];
2688 const PetscInt en = a->i[i + 1];
2689 for (j = st; j < en; j++) {
2690 const PetscInt brow = a->j[j];
2691 flops += 2. * (b->i[brow + 1] - b->i[brow]);
2692 }
2693 }
2694 } else if (ptype == MATPRODUCT_AtB) {
2695 for (i = 0, flops = 0; i < A->rmap->n; i++) {
2696 const PetscInt anzi = a->i[i + 1] - a->i[i];
2697 const PetscInt bnzi = b->i[i + 1] - b->i[i];
2698 flops += (2. * anzi) * bnzi;
2699 }
2700 } else flops = 0.; /* TODO */
2701
2702 mmdata->flops = flops;
2703 PetscCall(PetscLogGpuTimeBegin());
2704 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2705 PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2706 PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2707 PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2708 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2709 {
2710 /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2711 We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2712 */
2713 void *dBuffer1 = NULL;
2714 void *dBuffer2 = NULL;
2715 void *dBuffer3 = NULL;
2716 /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2717 size_t bufferSize1 = 0;
2718 size_t bufferSize2 = 0;
2719 size_t bufferSize3 = 0;
2720 size_t bufferSize4 = 0;
2721 size_t bufferSize5 = 0;
2722
2723 /* ask bufferSize1 bytes for external memory */
2724 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2725 PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2726 /* inspect the matrices A and B to understand the memory requirement for the next step */
2727 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));
2728
2729 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2730 PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2731 PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2732 PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2733 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2734 PetscCallHIP(hipFree(dBuffer1));
2735 PetscCallHIP(hipFree(dBuffer2));
2736
2737 /* get matrix C non-zero entries C_nnz1 */
2738 PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2739 c->nz = (PetscInt)C_nnz1;
2740 /* allocate matrix C */
2741 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2742 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2743 Ccsr->values = new THRUSTARRAY(c->nz);
2744 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2745 /* update matC with the new pointers */
2746 if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2747 PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2748
2749 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2750 PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2751 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2752 PetscCallHIP(hipFree(dBuffer3));
2753 PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2754 }
2755 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2756 }
2757 #else
2758 size_t bufSize2;
2759 /* ask bufferSize bytes for external memory */
2760 PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2761 PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2762 /* inspect the matrices A and B to understand the memory requirement for the next step */
2763 PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2764 /* ask bufferSize again bytes for external memory */
2765 PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2766 /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2767 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2768 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2769 is stored in the descriptor! What a messy API... */
2770 PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2771 /* compute the intermediate product of A * B */
2772 PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2773 /* get matrix C non-zero entries C_nnz1 */
2774 PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2775 c->nz = (PetscInt)C_nnz1;
2776 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2777 mmdata->mmBufferSize / 1024));
2778 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2779 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2780 Ccsr->values = new THRUSTARRAY(c->nz);
2781 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2782 PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2783 PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2784 #endif
2785 #else
2786 PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2787 PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2788 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2789 c->nz = cnz;
2790 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2791 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2792 Ccsr->values = new THRUSTARRAY(c->nz);
2793 PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2794
2795 PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2796 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2797 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2798 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2799 PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2800 Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2801 Ccsr->column_indices->data().get()));
2802 #endif
2803 PetscCall(PetscLogGpuFlops(mmdata->flops));
2804 PetscCall(PetscLogGpuTimeEnd());
2805 finalizesym:
2806 c->free_a = PETSC_TRUE;
2807 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2808 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2809 c->free_ij = PETSC_TRUE;
2810 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2811 PetscInt *d_i = c->i;
2812 THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2813 THRUSTINTARRAY jj(Ccsr->column_indices->size());
2814 ii = *Ccsr->row_offsets;
2815 jj = *Ccsr->column_indices;
2816 if (ciscompressed) d_i = c->compressedrow.i;
2817 PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2818 PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2819 } else {
2820 PetscInt *d_i = c->i;
2821 if (ciscompressed) d_i = c->compressedrow.i;
2822 PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2823 PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2824 }
2825 if (ciscompressed) { /* need to expand host row offsets */
2826 PetscInt r = 0;
2827 c->i[0] = 0;
2828 for (k = 0; k < c->compressedrow.nrows; k++) {
2829 const PetscInt next = c->compressedrow.rindex[k];
2830 const PetscInt old = c->compressedrow.i[k];
2831 for (; r < next; r++) c->i[r + 1] = old;
2832 }
2833 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2834 }
2835 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2836 PetscCall(PetscMalloc1(m, &c->ilen));
2837 PetscCall(PetscMalloc1(m, &c->imax));
2838 c->maxnz = c->nz;
2839 c->nonzerorowcnt = 0;
2840 c->rmax = 0;
2841 for (k = 0; k < m; k++) {
2842 const PetscInt nn = c->i[k + 1] - c->i[k];
2843 c->ilen[k] = c->imax[k] = nn;
2844 c->nonzerorowcnt += (PetscInt)!!nn;
2845 c->rmax = PetscMax(c->rmax, nn);
2846 }
2847 PetscCall(PetscMalloc1(c->nz, &c->a));
2848 Ccsr->num_entries = c->nz;
2849
2850 C->nonzerostate++;
2851 PetscCall(PetscLayoutSetUp(C->rmap));
2852 PetscCall(PetscLayoutSetUp(C->cmap));
2853 Ccusp->nonzerostate = C->nonzerostate;
2854 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2855 C->preallocated = PETSC_TRUE;
2856 C->assembled = PETSC_FALSE;
2857 C->was_assembled = PETSC_FALSE;
2858 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2859 mmdata->reusesym = PETSC_TRUE;
2860 C->offloadmask = PETSC_OFFLOAD_GPU;
2861 }
2862 C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2863 PetscFunctionReturn(PETSC_SUCCESS);
2864 }
2865
/* handles sparse or dense B */
/* Select the product implementation for C = op(A) op(B) (op(C)) when A is SEQAIJHIPSPARSE.
   B may be sparse (SEQAIJHIPSPARSE) or dense (SEQDENSE); when any operand is bound to the
   CPU, or the user requests it via the -mat*_backend_cpu options, fall back to the CPU
   (SeqAIJ) implementations. Only sets mat->ops->productsymbolic; no numerics happen here. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* B (and C for ABC products) count as GPU operands only if they are HIPSPARSE matrices not bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the product was requested through the legacy
       MatMatMult()-style API (product->api_user) or the MatProduct API */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* pretend the operands are not on the GPU so we dispatch to the CPU below */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized HIPSPARSE triple-product kernels; compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2978
/* yy = A*xx; thin wrapper over the shared kernel (no transpose, no add) */
static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2985
/* zz = A*xx + yy; thin wrapper over the shared kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2992
/* yy = A^H*xx; thin wrapper over the shared kernel (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2999
/* zz = A^H*xx + yy; thin wrapper over the shared kernel (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3006
/* yy = A^T*xx; thin wrapper over the shared kernel (transpose, not Hermitian) */
static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3013
ScatterAdd(PetscInt n,PetscInt * idx,const PetscScalar * x,PetscScalar * y)3014 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3015 {
3016 int i = blockIdx.x * blockDim.x + threadIdx.x;
3017 if (i < n) y[idx[i]] += x[i];
3018 }
3019
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/etc. Handles the
   compressed-row (zero rows dropped) storage by routing the short product through
   hipsparsestruct->workVector and scattering/gathering with cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
  PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                      compressed;
  PetscInt                       nx, ny;

  PetscFunctionBegin;
  /* herm without trans would mean plain conjugation, which this kernel does not implement */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: the product contributes nothing, so zz is just yy (or zero) */
    if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
    else PetscCall(VecSeq_HIP::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let hipSPARSE apply the (conjugate) transpose operation on the untransposed storage */
      opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
    } else {
      /* use (building on demand) an explicitly stored transpose, multiplied without op */
      if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
  try {
    PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecHIPGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::hip::par.on(PetscDefaultHipStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
      }
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
    }
    /* csr_spmv does y = alpha op(A) x + beta y */
    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) && !PETSC_PKG_HIP_VERSION_EQ(7, 2, 0)
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
        PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
        PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
        PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
                                                    &matstruct->hipSpMV[opA].spmvBufferSize));
        PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
        matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
        PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
      }
      PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
                                       matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      nx             = mat->num_rows; /* nx,ny are set before the #if block, set them again to avoid set-but-not-used warning */
      ny             = mat->num_cols;
      PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, nx, ny, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (hipsparsestruct->nrows) {
        /* non-CSR storage (ELL/HYB) */
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
        PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_HIP::Copy(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_HIP::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecHIPPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
    else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero (multiply + add); without the add of yy, each nonempty row saves one add */
  if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3174
/* zz = A^T*xx + yy; thin wrapper over the shared kernel (transpose, not Hermitian) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3181
/* Assembly is entirely handled by the host-side SeqAIJ implementation; GPU copies happen lazily later */
static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3188
3189 /*@
3190 MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
    This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3192
3193 Collective
3194
3195 Input Parameters:
3196 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3197 . m - number of rows
3198 . n - number of columns
3199 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3200 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3201
3202 Output Parameter:
3203 . A - the matrix
3204
3205 Level: intermediate
3206
3207 Notes:
3208 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
    `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3210 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3211
3212 The AIJ format (compressed row storage), is fully compatible with standard Fortran
3213 storage. That is, the stored row and column indices can begin at
3214 either one (as in Fortran) or zero.
3215
3216 Specify the preallocated storage with either `nz` or `nnz` (not both).
3217 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3218 allocation.
3219
3220 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3221 @*/
PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
  /* call the SeqAIJ preallocation directly; the HIPSPARSE type shares the host CSR storage */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3231
MatDestroy_SeqAIJHIPSPARSE(Mat A)3232 static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3233 {
3234 PetscFunctionBegin;
3235 if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
3236 else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3237 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3238 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3239 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3240 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3241 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3242 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3243 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3244 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3245 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3246 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3247 PetscCall(MatDestroy_SeqAIJ(A));
3248 PetscFunctionReturn(PETSC_SUCCESS);
3249 }
3250
/* Duplicate on the host via SeqAIJ, then convert the copy in place back to the HIPSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3258
/* Y += a*X on the GPU. Uses a single hipBLAS axpy when the nonzero patterns match,
   hipSPARSE csrgeam when X's pattern is a subset of Y's, and falls back to the CPU
   SeqAIJ implementation otherwise (or when the two matrices use different axpy ops). */
static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJHIPSPARSE *cy;
  Mat_SeqAIJHIPSPARSE *cx;
  PetscScalar         *ay;
  const PetscScalar   *ax;
  CsrMatrix           *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed types (e.g. one bound to CPU): invalidate Y's cached transpose and use the host path */
    PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a hipblas axpy: compare row offsets and column indices on device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = 1.0*Y + a*X via csrgeam; the output reuses Y's pattern and value array */
    PetscScalar b = 1.0;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
    /* a and b live on the host here, so switch hipSPARSE to host pointer mode for the call */
    PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                       csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallHIP(hipMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallHIP(hipFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the device pointer mode used elsewhere in this file */
    PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays align entry-for-entry, so a dense BLAS1 axpy suffices */
    hipblasHandle_t hipblasv2handle;
    PetscBLASInt    one = 1, bnz = 1;

    PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
    PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
  } else {
    /* different patterns: Y's structure will change, so drop the cached transpose and do it on the host */
    PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3341
MatScale_SeqAIJHIPSPARSE(Mat Y,PetscScalar a)3342 static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3343 {
3344 Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3345 PetscScalar *ay;
3346 hipblasHandle_t hipblasv2handle;
3347 PetscBLASInt one = 1, bnz = 1;
3348
3349 PetscFunctionBegin;
3350 PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3351 PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3352 PetscCall(PetscBLASIntCast(y->nz, &bnz));
3353 PetscCall(PetscLogGpuTimeBegin());
3354 PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3355 PetscCall(PetscLogGpuFlops(bnz));
3356 PetscCall(PetscLogGpuTimeEnd());
3357 PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3358 PetscFunctionReturn(PETSC_SUCCESS);
3359 }
3360
/* Zero all stored values, on both the GPU copies (matrix and cached transpose, if present)
   and the host CSR array, keeping the offload mask consistent with where valid data lives. */
static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the GPU copy was zeroed too */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    /* keep the cached explicit transpose (if any) in sync */
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  //PetscCall(MatZeroEntries_SeqAIJ(A));
  /* zero the host values directly; a->i[nrows] is the total number of stored nonzeros */
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3387
/* Report that this matrix type keeps its current data in HIP device memory */
static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_HIP;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3394
/*
   Switch the matrix between CPU (plain SeqAIJ) and GPU (HIPSPARSE) kernels by
   swapping the Mat/Mat_SeqAIJ function-pointer tables and the composed
   per-implementation functions.

   Input Parameters:
+  A   - the MATSEQAIJHIPSPARSE matrix
-  flg - PETSC_TRUE to bind operations to the CPU, PETSC_FALSE to use the GPU
*/
static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Factored matrices only record the binding; no op tables are swapped here */
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make the host copy current before routing all operations to the CPU */
    PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* wipe the whole Mat_SeqAIJ-level op table so the SeqAIJ defaults apply */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* remove the GPU-specific composed functions */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
  } else {
    /* install the HIPSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
    A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJHIPSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only make sense for the CPU kernels */
  if (flg && a->inode.size_csr) a->inode.use = PETSC_TRUE;
  else a->inode.use = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3456
/*
   Convert a SeqAIJ matrix to MATSEQAIJHIPSPARSE: (re)allocate the GPU-side
   spptr bookkeeping, set the HIP default vector type, install the HIPSPARSE
   op tables, and retag the object's type name.

   Input Parameters:
+  A     - the source matrix
.  mtype - requested type (expected MATSEQAIJHIPSPARSE; not checked here)
-  reuse - MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX, or MAT_INPLACE_MATRIX

   Output Parameter:
.  newmat - the converted matrix (equals A for MAT_INPLACE_MATRIX)
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;
  /* vectors created from this matrix should be VECHIP */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the multiply-side GPU state */
      Mat_SeqAIJHIPSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      spptr->format = MAT_HIPSPARSE_CSR;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
      spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
#else
      spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factor GPU state instead */
      Mat_SeqAIJHIPSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJHIPSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJHIPSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJHIPSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJHIPSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJHIPSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJHIPSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;

  /* install the GPU op tables (bind-to-CPU with flg = PETSC_FALSE) */
  PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3514
/* Type constructor for MATSEQAIJHIPSPARSE: build a plain SeqAIJ matrix and
   convert it in place to the HIPSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3522
3523 /*MC
3524 MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
3525
3526 A matrix type whose data resides on AMD GPUs. These matrices can be in either
3527 CSR, ELL, or Hybrid format.
   All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.
3529
3530 Options Database Keys:
3531 + -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3532 . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3533 Other options include ell (ellpack) or hyb (hybrid).
3534 . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3535 - -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3536
3537 Level: beginner
3538
3539 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3540 M*/
3541
MatSolverTypeRegister_HIPSPARSE(void)3542 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3543 {
3544 PetscFunctionBegin;
3545 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
3546 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
3547 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
3548 PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
3549 PetscFunctionReturn(PETSC_SUCCESS);
3550 }
3551
/* Release all GPU-side state hanging off mat->spptr for a non-factored
   MATSEQAIJHIPSPARSE matrix: the multiply structs (plain and transpose),
   thrust work arrays, and the hipSPARSE handle. Safe to call when spptr is
   NULL. (The local is named "cusp" following the CUDA sibling code.) */
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* thrust device arrays: deleting a null pointer is a no-op */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3569
CsrMatrix_Destroy(CsrMatrix ** mat)3570 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3571 {
3572 PetscFunctionBegin;
3573 if (*mat) {
3574 delete (*mat)->values;
3575 delete (*mat)->column_indices;
3576 delete (*mat)->row_offsets;
3577 delete *mat;
3578 *mat = 0;
3579 }
3580 PetscFunctionReturn(PETSC_SUCCESS);
3581 }
3582
/* Free one triangular-factor struct: its hipSPARSE descriptors, CSR storage,
   and the device/host scratch buffers used by the solve and csr2csc paths.
   Each member is guarded, so partially built structs are handled. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h)); /* pinned host buffer */
    if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3597
/* Free a multiply struct. The storage pointed to by (*matstruct)->mat depends
   on the storage format: a hipsparseHybMat_t for ELL/HYB, a CsrMatrix for CSR.
   Also frees the legacy descriptor, device-resident scalar constants, the
   generic SpMat descriptor, and any cached SpMV buffers/descriptors. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
        PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars kept in device memory */
    if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));

    Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
    /* three cached SpMV setups (see hipSpMV[3] in the struct definition) */
    for (int i = 0; i < 3; i++) {
      if (mdata->hipSpMV[i].initialized) {
        PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
      }
    }
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3633
/* Reset (but do not free) a triangular-factors container: destroy the four
   factor structs, the permutation index arrays, the work vector, and - on
   HIP >= 4.5 - all state used by the generic SpSV-based solve path. The
   container itself and its hipSPARSE handle survive, so the matrix can be
   refactored without reallocating them. */
PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->workVector    = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* generic-API (SpSV) solve path state; hipFree(NULL) is a no-op */
    PetscCallHIP(hipFree(fs->csrRowPtr));
    PetscCallHIP(hipFree(fs->csrColIdx));
    PetscCallHIP(hipFree(fs->csrVal));
    PetscCallHIP(hipFree(fs->X));
    PetscCallHIP(hipFree(fs->Y));
    // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallHIP(hipFree(fs->spsvBuffer_L));
    PetscCallHIP(hipFree(fs->spsvBuffer_U));
    PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
    PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
    PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3680
MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors ** trifactors)3681 static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3682 {
3683 hipsparseHandle_t handle;
3684
3685 PetscFunctionBegin;
3686 if (*trifactors) {
3687 PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3688 if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3689 PetscCall(PetscFree(*trifactors));
3690 }
3691 PetscFunctionReturn(PETSC_SUCCESS);
3692 }
3693
3694 struct IJCompare {
operator ()IJCompare3695 __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3696 {
3697 if (t1.get<0>() < t2.get<0>()) return true;
3698 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3699 return false;
3700 }
3701 };
3702
/* Mark the cached transpose as stale; if destroy is PETSC_TRUE also free the
   transpose multiply struct and the cached csr2csc index map so they will be
   rebuilt from scratch on next use. */
static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); /* nothing cached yet */
  if (destroy) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3718
/* PetscContainer destructor for the device-side COO struct attached by
   MatSetPreallocationCOO_SeqAIJHIPSPARSE: frees the device perm/jmap arrays
   and then the (host-allocated) struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(PetscCtxRt data)
{
  MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)data;

  PetscFunctionBegin;
  PetscCallHIP(hipFree(coo->perm));
  PetscCallHIP(hipFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3729
/*
   COO preallocation for MATSEQAIJHIPSPARSE. The host SeqAIJ implementation
   does the actual index analysis, so device-resident coo_i/coo_j are first
   staged to the host; afterwards the resulting host COO struct is shallow-
   copied with its jmap/perm arrays mirrored into device memory, and attached
   to the matrix as "__PETSc_MatCOOStruct_Device" for MatSetValuesCOO.

   Input Parameters:
+  mat   - the matrix
.  coo_n - number of COO entries
.  coo_i - row indices (host or device memory)
-  coo_j - column indices (assumed to be in the same memory space as coo_i)
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* stage device indices to temporary host buffers for the SeqAIJ analysis */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3769
MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])3770 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
3771 {
3772 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
3773 const PetscCount grid_size = gridDim.x * blockDim.x;
3774 for (; i < nnz; i += grid_size) {
3775 PetscScalar sum = 0.0;
3776 for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
3777 a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
3778 }
3779 }
3780
/*
   Insert/add COO values on the GPU using the device COO struct prepared by
   MatSetPreallocationCOO_SeqAIJHIPSPARSE. Host-resident v[] is copied to a
   temporary device buffer first.

   Input Parameters:
+  A     - the matrix
.  v     - the COO values (host or device memory)
-  imode - INSERT_VALUES or ADD_VALUES
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *dev = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  /* make sure the CSR structure exists on the device */
  if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, &coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
  }

  /* write access discards old values; read-write is needed for ADD_VALUES */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallHIP(hipPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3820
3821 /*@C
3822 MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3823
3824 Not Collective
3825
3826 Input Parameters:
3827 + A - the matrix
3828 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3829
3830 Output Parameters:
3831 + i - the CSR row pointers
3832 - j - the CSR column indices
3833
3834 Level: developer
3835
3836 Note:
3837 When compressed is true, the CSR structure does not contain empty rows
3838
3839 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3840 @*/
/* See the manpage above: return device pointers to the CSR row offsets and
   column indices. If the matrix uses compressed rows but the caller wants an
   uncompressed view, a full row-offset array is built on the device once and
   cached in rowoffsets_gpu. */
PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both outputs requested or nothing is done */
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  /* the `if (i)`/`if (j)` guards below are redundant after the early return
     above but kept for symmetry with related accessors */
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
3868
3869 /*@C
3870 MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3871
3872 Not Collective
3873
3874 Input Parameters:
3875 + A - the matrix
3876 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3877 . i - the CSR row pointers
3878 - j - the CSR column indices
3879
3880 Level: developer
3881
3882 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3883 @*/
/* See the manpage above: invalidate the caller's borrowed i/j pointers.
   No device work is needed; the arrays remain owned by the matrix. */
PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3893
3894 /*@C
3895 MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3896
3897 Not Collective
3898
3899 Input Parameter:
3900 . A - a `MATSEQAIJHIPSPARSE` matrix
3901
3902 Output Parameter:
3903 . a - pointer to the device data
3904
3905 Level: developer
3906
3907 Note:
3908 May trigger host-device copies if the up-to-date matrix data is on host
3909
3910 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3911 @*/
/* See the manpage above: return a read-only device pointer to the CSR values.
   Triggers a host-to-device copy if the device copy is stale; the offload
   mask is left unchanged since the data is not modified. */
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
3929
3930 /*@C
3931 MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3932
3933 Not Collective
3934
3935 Input Parameters:
3936 + A - a `MATSEQAIJHIPSPARSE` matrix
3937 - a - pointer to the device data
3938
3939 Level: developer
3940
3941 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3942 @*/
/* See the manpage above: invalidate the borrowed read-only pointer. No state
   change is needed since read access cannot have modified the values. */
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3952
3953 /*@C
3954 MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3955
3956 Not Collective
3957
3958 Input Parameter:
3959 . A - a `MATSEQAIJHIPSPARSE` matrix
3960
3961 Output Parameter:
3962 . a - pointer to the device data
3963
3964 Level: developer
3965
3966 Note:
3967 May trigger host-device copies if up-to-date matrix data is on host
3968
3969 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3970 @*/
/* See the manpage above: return a read-write device pointer to the CSR
   values. Copies host data to the device if stale, then marks the GPU as the
   authoritative copy and invalidates the cached transpose (its values would
   no longer match after a write). */
PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3990 /*@C
3991 MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
3992
3993 Not Collective
3994
3995 Input Parameters:
3996 + A - a `MATSEQAIJHIPSPARSE` matrix
3997 - a - pointer to the device data
3998
3999 Level: developer
4000
4001 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4002 @*/
/* See the manpage above: end read-write access. Bumps the object state so
   dependents know the values may have changed, then invalidates the pointer. */
PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4013
4014 /*@C
4015 MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4016
4017 Not Collective
4018
4019 Input Parameter:
4020 . A - a `MATSEQAIJHIPSPARSE` matrix
4021
4022 Output Parameter:
4023 . a - pointer to the device data
4024
4025 Level: developer
4026
4027 Note:
4028 Does not trigger host-device copies and flags data validity on the GPU
4029
4030 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4031 @*/
/* See the manpage above: return a write-only device pointer to the CSR
   values. Deliberately does NOT copy from the host (the caller will overwrite
   everything), so the device structure must already exist; marks the GPU copy
   authoritative and invalidates the cached transpose. */
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4050
4051 /*@C
4052 MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
4053
4054 Not Collective
4055
4056 Input Parameters:
4057 + A - a `MATSEQAIJHIPSPARSE` matrix
4058 - a - pointer to the device data
4059
4060 Level: developer
4061
4062 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4063 @*/
/* See the manpage above: end write-only access. Bumps the object state so
   dependents know the values changed, then invalidates the pointer. */
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4074
4075 struct IJCompare4 {
operator ()IJCompare44076 __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4077 {
4078 if (t1.get<0>() < t2.get<0>()) return true;
4079 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4080 return false;
4081 }
4082 };
4083
/* Unary functor adding a fixed offset to an index; used with thrust transform
   iterators to shift column indices. */
struct Shift {
  int _shift; // the constant offset applied to every input

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};
4090
4091 /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
MatSeqAIJHIPSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat * C)4092 PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4093 {
4094 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4095 Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4096 Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4097 CsrMatrix *Acsr, *Bcsr, *Ccsr;
4098 PetscInt Annz, Bnnz;
4099 PetscInt i, m, n, zero = 0;
4100
4101 PetscFunctionBegin;
4102 PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4103 PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4104 PetscAssertPointer(C, 4);
4105 PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4106 PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4107 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4108 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4109 PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4110 PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4111 if (reuse == MAT_INITIAL_MATRIX) {
4112 m = A->rmap->n;
4113 n = A->cmap->n + B->cmap->n;
4114 PetscCall(MatCreate(PETSC_COMM_SELF, C));
4115 PetscCall(MatSetSizes(*C, m, n, m, n));
4116 PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4117 c = (Mat_SeqAIJ *)(*C)->data;
4118 Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4119 Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
4120 Ccsr = new CsrMatrix;
4121 Cmat->cprowIndices = NULL;
4122 c->compressedrow.use = PETSC_FALSE;
4123 c->compressedrow.nrows = 0;
4124 c->compressedrow.i = NULL;
4125 c->compressedrow.rindex = NULL;
4126 Ccusp->workVector = NULL;
4127 Ccusp->nrows = m;
4128 Ccusp->mat = Cmat;
4129 Ccusp->mat->mat = Ccsr;
4130 Ccsr->num_rows = m;
4131 Ccsr->num_cols = n;
4132 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4133 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4134 PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4135 PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4136 PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4137 PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4138 PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4139 PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4140 PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4141 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4142 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4143 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4144 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4145
4146 Acsr = (CsrMatrix *)Acusp->mat->mat;
4147 Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4148 Annz = (PetscInt)Acsr->column_indices->size();
4149 Bnnz = (PetscInt)Bcsr->column_indices->size();
4150 c->nz = Annz + Bnnz;
4151 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4152 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4153 Ccsr->values = new THRUSTARRAY(c->nz);
4154 Ccsr->num_entries = c->nz;
4155 Ccusp->coords = new THRUSTINTARRAY(c->nz);
4156 if (c->nz) {
4157 auto Acoo = new THRUSTINTARRAY32(Annz);
4158 auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4159 auto Ccoo = new THRUSTINTARRAY32(c->nz);
4160 THRUSTINTARRAY32 *Aroff, *Broff;
4161
4162 if (a->compressedrow.use) { /* need full row offset */
4163 if (!Acusp->rowoffsets_gpu) {
4164 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4165 Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4166 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4167 }
4168 Aroff = Acusp->rowoffsets_gpu;
4169 } else Aroff = Acsr->row_offsets;
4170 if (b->compressedrow.use) { /* need full row offset */
4171 if (!Bcusp->rowoffsets_gpu) {
4172 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4173 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4174 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4175 }
4176 Broff = Bcusp->rowoffsets_gpu;
4177 } else Broff = Bcsr->row_offsets;
4178 PetscCall(PetscLogGpuTimeBegin());
4179 PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4180 PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4181 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4182 auto Aperm = thrust::make_constant_iterator(1);
4183 auto Bperm = thrust::make_constant_iterator(0);
4184 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4185 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4186 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4187 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4188 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4189 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4190 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4191 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4192 auto p1 = Ccusp->coords->begin();
4193 auto p2 = Ccusp->coords->begin();
4194 thrust::advance(p2, Annz);
4195 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4196 auto cci = thrust::make_counting_iterator(zero);
4197 auto cce = thrust::make_counting_iterator(c->nz);
4198 #if 0 //Errors on SUMMIT cuda 11.1.0
4199 PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4200 #else
4201 auto pred = [](const int &x) { return x; };
4202 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4203 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4204 #endif
4205 PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4206 PetscCall(PetscLogGpuTimeEnd());
4207 delete wPerm;
4208 delete Acoo;
4209 delete Bcoo;
4210 delete Ccoo;
4211 PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4212
4213 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4214 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4215 PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4216 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4217 Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4218 CsrMatrix *CcsrT = new CsrMatrix;
4219 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4220 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4221
4222 (*C)->form_explicit_transpose = PETSC_TRUE;
4223 (*C)->transupdated = PETSC_TRUE;
4224 Ccusp->rowoffsets_gpu = NULL;
4225 CmatT->cprowIndices = NULL;
4226 CmatT->mat = CcsrT;
4227 CcsrT->num_rows = n;
4228 CcsrT->num_cols = m;
4229 CcsrT->num_entries = c->nz;
4230 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4231 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4232 CcsrT->values = new THRUSTARRAY(c->nz);
4233
4234 PetscCall(PetscLogGpuTimeBegin());
4235 auto rT = CcsrT->row_offsets->begin();
4236 if (AT) {
4237 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4238 thrust::advance(rT, -1);
4239 }
4240 if (BT) {
4241 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4242 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4243 thrust::copy(titb, tite, rT);
4244 }
4245 auto cT = CcsrT->column_indices->begin();
4246 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4247 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4248 auto vT = CcsrT->values->begin();
4249 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4250 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4251 PetscCall(PetscLogGpuTimeEnd());
4252
4253 PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4254 PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4255 PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4256 PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4257 PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4258 PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4259 PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4260 PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4261 PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4262
4263 PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4264 Ccusp->matTranspose = CmatT;
4265 }
4266 }
4267
4268 c->free_a = PETSC_TRUE;
4269 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4270 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4271 c->free_ij = PETSC_TRUE;
4272 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4273 THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4274 THRUSTINTARRAY jj(Ccsr->column_indices->size());
4275 ii = *Ccsr->row_offsets;
4276 jj = *Ccsr->column_indices;
4277 PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4278 PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4279 } else {
4280 PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4281 PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4282 }
4283 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4284 PetscCall(PetscMalloc1(m, &c->ilen));
4285 PetscCall(PetscMalloc1(m, &c->imax));
4286 c->maxnz = c->nz;
4287 c->nonzerorowcnt = 0;
4288 c->rmax = 0;
4289 for (i = 0; i < m; i++) {
4290 const PetscInt nn = c->i[i + 1] - c->i[i];
4291 c->ilen[i] = c->imax[i] = nn;
4292 c->nonzerorowcnt += (PetscInt)!!nn;
4293 c->rmax = PetscMax(c->rmax, nn);
4294 }
4295 PetscCall(PetscMalloc1(c->nz, &c->a));
4296 (*C)->nonzerostate++;
4297 PetscCall(PetscLayoutSetUp((*C)->rmap));
4298 PetscCall(PetscLayoutSetUp((*C)->cmap));
4299 Ccusp->nonzerostate = (*C)->nonzerostate;
4300 (*C)->preallocated = PETSC_TRUE;
4301 } else {
4302 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4303 c = (Mat_SeqAIJ *)(*C)->data;
4304 if (c->nz) {
4305 Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4306 PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4307 PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4308 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4309 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4310 PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4311 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4312 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4313 Acsr = (CsrMatrix *)Acusp->mat->mat;
4314 Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4315 Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4316 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4317 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4318 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4319 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4320 PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4321 auto pmid = Ccusp->coords->begin();
4322 thrust::advance(pmid, Acsr->num_entries);
4323 PetscCall(PetscLogGpuTimeBegin());
4324 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4325 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4326 thrust::for_each(zibait, zieait, VecHIPEquals());
4327 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4328 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4329 thrust::for_each(zibbit, ziebit, VecHIPEquals());
4330 PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4331 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4332 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4333 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4334 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4335 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4336 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4337 auto vT = CcsrT->values->begin();
4338 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4339 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4340 (*C)->transupdated = PETSC_TRUE;
4341 }
4342 PetscCall(PetscLogGpuTimeEnd());
4343 }
4344 }
4345 PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4346 (*C)->assembled = PETSC_TRUE;
4347 (*C)->was_assembled = PETSC_FALSE;
4348 (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4349 PetscFunctionReturn(PETSC_SUCCESS);
4350 }
4351
MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A,PetscInt n,const PetscInt idx[],PetscScalar v[])4352 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4353 {
4354 bool dmem;
4355 const PetscScalar *av;
4356
4357 PetscFunctionBegin;
4358 dmem = isHipMem(v);
4359 PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4360 if (n && idx) {
4361 THRUSTINTARRAY widx(n);
4362 widx.assign(idx, idx + n);
4363 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4364
4365 THRUSTARRAY *w = NULL;
4366 thrust::device_ptr<PetscScalar> dv;
4367 if (dmem) dv = thrust::device_pointer_cast(v);
4368 else {
4369 w = new THRUSTARRAY(n);
4370 dv = w->data();
4371 }
4372 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4373
4374 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4375 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4376 thrust::for_each(zibit, zieit, VecHIPEquals());
4377 if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4378 delete w;
4379 } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4380
4381 if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4382 PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4383 PetscFunctionReturn(PETSC_SUCCESS);
4384 }
4385