1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the HIPSPARSE library.
4   Portions of this code are under:
5   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6 */
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal()
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/iterator/transform_iterator.h>
17 #if PETSC_CPP_VERSION >= 14
18   #define PETSC_HAVE_THRUST_ASYNC 1
19   #include <thrust/async/for_each.h>
20 #endif
21 #include <thrust/iterator/constant_iterator.h>
22 #include <thrust/iterator/discard_iterator.h>
23 #include <thrust/binary_search.h>
24 #include <thrust/remove.h>
25 #include <thrust/sort.h>
26 #include <thrust/unique.h>
27 
28 const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
29 const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
30 const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
31 //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
32 
33 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
34 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
35 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
36 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
37 static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
38 static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
39 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
40 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
41 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
42 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
43 static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
44 static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
45 static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
46 static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
47 static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
48 static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
49 static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
50 static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
51 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
52 static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
53 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
54 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
55 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
56 static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
57 static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
58 static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
59 static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
60 static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
61 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
62 static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
63 static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
64 static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);
65 
66 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
67 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
68 
69 /*
70 PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
71 {
72   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
73 
74   PetscFunctionBegin;
75   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
76   hipsparsestruct->stream = stream;
77   PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
78   PetscFunctionReturn(PETSC_SUCCESS);
79 }
80 
81 PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
82 {
83   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
84 
85   PetscFunctionBegin;
86   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
87   if (hipsparsestruct->handle != handle) {
88     if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
89     hipsparsestruct->handle = handle;
90   }
91   PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
96 {
97   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
98   PetscBool            flg;
99 
100   PetscFunctionBegin;
101   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
102   if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
103   if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 */
107 
108 PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109 {
110   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
111 
112   PetscFunctionBegin;
113   switch (op) {
114   case MAT_HIPSPARSE_MULT:
115     hipsparsestruct->format = format;
116     break;
117   case MAT_HIPSPARSE_ALL:
118     hipsparsestruct->format = format;
119     break;
120   default:
121     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
122   }
123   PetscFunctionReturn(PETSC_SUCCESS);
124 }
125 
126 /*@
127   MatHIPSPARSESetFormat - Sets the storage format of `MATSEQAIJHIPSPARSE` matrices for a particular
128   operation. Only the `MatMult()` operation can use different GPU storage formats.
129 
130   Not Collective
131 
132   Input Parameters:
133 + A      - Matrix of type `MATSEQAIJHIPSPARSE`
134 . op     - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
135          `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
136 - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`).
137 
138   Level: intermediate
139 
140 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141 @*/
142 PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
143 {
144   PetscFunctionBegin;
145   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
146   PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
147   PetscFunctionReturn(PETSC_SUCCESS);
148 }
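/*
  Illustrative usage sketch (editorial addition, not part of the library source; MatCreate()/MatSetSizes()/MatSetType()
  are standard PETSc calls and n is a placeholder size): select the ELL storage format for MatMult() only.

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, n, n, n, n));
    PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
    // ... preallocate and assemble A ...
    PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, MAT_HIPSPARSE_ELL));

  The same choice is available from the options database as -mat_hipsparse_mult_storage_format ell
  (see MatSetFromOptions_SeqAIJHIPSPARSE() below).
*/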
149 
150 PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
151 {
152   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
153 
154   PetscFunctionBegin;
155   hipsparsestruct->use_cpu_solve = use_cpu;
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 /*@
160   MatHIPSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
161 
162   Input Parameters:
163 + A       - Matrix of type `MATSEQAIJHIPSPARSE`
164 - use_cpu - flag indicating whether to use the built-in CPU `MatSolve()`
165 
166   Level: intermediate
167 
168   Notes:
169   The hipSPARSE LU solver currently computes the factors with the built-in CPU method
170   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
171   This method specifies whether the solve is done on the CPU or on the GPU (the GPU is the default).
172 
173 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174 @*/
175 PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
176 {
177   PetscFunctionBegin;
178   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
179   PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
180   PetscFunctionReturn(PETSC_SUCCESS);
181 }
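/*
  Illustrative usage sketch (editorial addition; assumes A is a MATSEQAIJHIPSPARSE matrix that will be
  factored): keep the (I)LU triangular solves on the CPU while other operations stay on the GPU.

    PetscCall(MatHIPSPARSESetUseCPUSolve(A, PETSC_TRUE));

  Equivalent options-database form: -mat_hipsparse_use_cpu_solve
*/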
182 
183 static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
184 {
185   PetscFunctionBegin;
186   switch (op) {
187   case MAT_FORM_EXPLICIT_TRANSPOSE:
188     /* destroy the transpose matrix if present, to prevent logic errors if flg is set to PETSC_TRUE later */
189     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
190     A->form_explicit_transpose = flg;
191     break;
192   default:
193     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
194     break;
195   }
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
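/*
  Illustrative usage sketch (editorial addition): MAT_FORM_EXPLICIT_TRANSPOSE is the only option handled
  specially above; all other options fall through to MatSetOption_SeqAIJ().

    PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE)); // keep an explicit transpose for repeated MatMultTranspose()/MatSolveTranspose()
*/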
198 
199 static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
200 {
201   PetscBool            row_identity, col_identity;
202   Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
203   IS                   isrow = b->row, iscol = b->col;
204   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;
205 
206   PetscFunctionBegin;
207   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
208   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
209   B->offloadmask = PETSC_OFFLOAD_CPU;
210   /* determine which version of MatSolve needs to be used. */
211   PetscCall(ISIdentity(isrow, &row_identity));
212   PetscCall(ISIdentity(iscol, &col_identity));
213   if (!hipsparsestruct->use_cpu_solve) {
214     if (row_identity && col_identity) {
215       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
216       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
217     } else {
218       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
219       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
220     }
221   }
222   B->ops->matsolve          = NULL;
223   B->ops->matsolvetranspose = NULL;
224 
225   /* get the triangular factors */
226   if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
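/*
  Editorial note (assumed typical usage): this numeric LU factorization is normally reached through a PETSc
  preconditioner rather than called directly, e.g. with options such as

    -mat_type seqaijhipsparse -pc_type ilu

  optionally combined with -mat_hipsparse_use_cpu_solve to keep the triangular solves on the CPU.
*/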
229 
230 static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
231 {
232   MatHIPSPARSEStorageFormat format;
233   PetscBool                 flg;
234   Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
235 
236   PetscFunctionBegin;
237   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
238   if (A->factortype == MAT_FACTOR_NONE) {
239     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
240     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
241     PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
242     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
243     PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
244     if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
245     PetscCall(
246       PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
247     /* If the user set this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
248     PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
249     PetscCall(
250       PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
251     PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
252     /*
253     PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
254     PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
255     */
256   }
257   PetscOptionsHeadEnd();
258   PetscFunctionReturn(PETSC_SUCCESS);
259 }
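/*
  Options-database sketch (editorial addition) of the settings parsed above; the value strings follow the
  enum string tables defined at the top of this file:

    -mat_hipsparse_mult_storage_format ell    # storage format used for SpMV only
    -mat_hipsparse_storage_format csr         # storage format used for SpMV and TriSolve
    -mat_hipsparse_use_cpu_solve              # perform the (I)LU solves on the CPU
    -mat_hipsparse_spmv_alg spmv_csr_alg1     # hipSPARSE SpMV algorithm
    -mat_hipsparse_spmm_alg csr_alg2          # hipSPARSE SpMM algorithm
*/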
260 
261 static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
262 {
263   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
264   PetscInt                            n                   = A->rmap->n;
265   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
266   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
267   const PetscInt                     *ai = a->i, *aj = a->j, *vi;
268   const MatScalar                    *aa = a->a, *v;
269   PetscInt                           *AiLo, *AjLo;
270   PetscInt                            i, nz, nzLower, offset, rowOffset;
271 
272   PetscFunctionBegin;
273   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
274   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
275     try {
276       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
277       nzLower = n + ai[n] - ai[1];
278       if (!loTriFactor) {
279         PetscScalar *AALo;
280         PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));
281 
282         /* Allocate Space for the lower triangular matrix */
283         PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
284         PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));
285 
286         /* Fill the lower triangular matrix */
287         AiLo[0]   = (PetscInt)0;
288         AiLo[n]   = nzLower;
289         AjLo[0]   = (PetscInt)0;
290         AALo[0]   = (MatScalar)1.0;
291         v         = aa;
292         vi        = aj;
293         offset    = 1;
294         rowOffset = 1;
295         for (i = 1; i < n; i++) {
296           nz = ai[i + 1] - ai[i];
297           /* additional 1 for the term on the diagonal */
298           AiLo[i] = rowOffset;
299           rowOffset += nz + 1;
300 
301           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
302           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
303           offset += nz;
304           AjLo[offset] = (PetscInt)i;
305           AALo[offset] = (MatScalar)1.0;
306           offset += 1;
307           v += nz;
308           vi += nz;
309         }
310 
311         /* allocate space for the triangular factor information */
312         PetscCall(PetscNew(&loTriFactor));
313         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
314         /* Create the matrix description */
315         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
316         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
317         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
318         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
319         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
320 
321         /* set the operation */
322         loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
323 
324         /* set the matrix */
325         loTriFactor->csrMat                 = new CsrMatrix;
326         loTriFactor->csrMat->num_rows       = n;
327         loTriFactor->csrMat->num_cols       = n;
328         loTriFactor->csrMat->num_entries    = nzLower;
329         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
330         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
331         loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);
332 
333         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
334         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
335         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
336 
337         /* Create the solve analysis information */
338         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
339         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
340         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
341                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
342         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
343 
344         /* perform the solve analysis */
345         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
346                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
347 
348         PetscCallHIP(WaitForHIP());
349         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
350 
351         /* assign the pointer */
352         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
353         loTriFactor->AA_h                                           = AALo;
354         PetscCallHIP(hipHostFree(AiLo));
355         PetscCallHIP(hipHostFree(AjLo));
356         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
357       } else { /* update values only */
358         if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
359         /* Fill the lower triangular matrix */
360         loTriFactor->AA_h[0] = 1.0;
361         v                    = aa;
362         vi                   = aj;
363         offset               = 1;
364         for (i = 1; i < n; i++) {
365           nz = ai[i + 1] - ai[i];
366           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
367           offset += nz;
368           loTriFactor->AA_h[offset] = 1.0;
369           offset += 1;
370           v += nz;
371         }
372         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
373         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
374       }
375     } catch (char *ex) {
376       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
377     }
378   }
379   PetscFunctionReturn(PETSC_SUCCESS);
380 }
381 
382 static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
383 {
384   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
385   PetscInt                            n                   = A->rmap->n;
386   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
387   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
388   const PetscInt                     *aj                  = a->j, *adiag, *vi;
389   const MatScalar                    *aa                  = a->a, *v;
390   PetscInt                           *AiUp, *AjUp;
391   PetscInt                            i, nz, nzUpper, offset;
392 
393   PetscFunctionBegin;
394   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
395   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
396   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
397     try {
398       /* next, figure out the number of nonzeros in the upper triangular matrix. */
399       nzUpper = adiag[0] - adiag[n];
400       if (!upTriFactor) {
401         PetscScalar *AAUp;
402         PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
403 
404         /* Allocate Space for the upper triangular matrix */
405         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
406         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
407 
408         /* Fill the upper triangular matrix */
409         AiUp[0] = (PetscInt)0;
410         AiUp[n] = nzUpper;
411         offset  = nzUpper;
412         for (i = n - 1; i >= 0; i--) {
413           v  = aa + adiag[i + 1] + 1;
414           vi = aj + adiag[i + 1] + 1;
415           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
416           offset -= (nz + 1);               /* decrement the offset */
417 
418           /* first, set the diagonal elements */
419           AjUp[offset] = (PetscInt)i;
420           AAUp[offset] = (MatScalar)1. / v[nz];
421           AiUp[i]      = AiUp[i + 1] - (nz + 1);
422 
423           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
424           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
425         }
426 
427         /* allocate space for the triangular factor information */
428         PetscCall(PetscNew(&upTriFactor));
429         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
430 
431         /* Create the matrix description */
432         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
433         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
434         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
435         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
436         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
437 
438         /* set the operation */
439         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
440 
441         /* set the matrix */
442         upTriFactor->csrMat                 = new CsrMatrix;
443         upTriFactor->csrMat->num_rows       = n;
444         upTriFactor->csrMat->num_cols       = n;
445         upTriFactor->csrMat->num_entries    = nzUpper;
446         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
447         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
448         upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
449         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
450         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
451         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
452 
453         /* Create the solve analysis information */
454         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
455         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
456         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
457                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
458         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
459 
460         /* perform the solve analysis */
461         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
462                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
463 
464         PetscCallHIP(WaitForHIP());
465         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
466 
467         /* assign the pointer */
468         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
469         upTriFactor->AA_h                                           = AAUp;
470         PetscCallHIP(hipHostFree(AiUp));
471         PetscCallHIP(hipHostFree(AjUp));
472         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
473       } else {
474         if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
475         /* Fill the upper triangular matrix */
476         offset = nzUpper;
477         for (i = n - 1; i >= 0; i--) {
478           v  = aa + adiag[i + 1] + 1;
479           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
480           offset -= (nz + 1);               /* decrement the offset */
481 
482           /* first, set the diagonal elements */
483           upTriFactor->AA_h[offset] = 1. / v[nz];
484           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
485         }
486         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
487         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
488       }
489     } catch (char *ex) {
490       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
491     }
492   }
493   PetscFunctionReturn(PETSC_SUCCESS);
494 }
495 
496 static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
497 {
498   PetscBool                      row_identity, col_identity;
499   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
500   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
501   IS                             isrow = a->row, iscol = a->icol;
502   PetscInt                       n = A->rmap->n;
503 
504   PetscFunctionBegin;
505   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
506   PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
507   PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));
508 
509   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
510   hipsparseTriFactors->nnz = a->nz;
511 
512   A->offloadmask = PETSC_OFFLOAD_BOTH;
513   /* lower triangular indices */
514   PetscCall(ISIdentity(isrow, &row_identity));
515   if (!row_identity && !hipsparseTriFactors->rpermIndices) {
516     const PetscInt *r;
517 
518     PetscCall(ISGetIndices(isrow, &r));
519     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
520     hipsparseTriFactors->rpermIndices->assign(r, r + n);
521     PetscCall(ISRestoreIndices(isrow, &r));
522     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
523   }
524   /* upper triangular indices */
525   PetscCall(ISIdentity(iscol, &col_identity));
526   if (!col_identity && !hipsparseTriFactors->cpermIndices) {
527     const PetscInt *c;
528 
529     PetscCall(ISGetIndices(iscol, &c));
530     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
531     hipsparseTriFactors->cpermIndices->assign(c, c + n);
532     PetscCall(ISRestoreIndices(iscol, &c));
533     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
534   }
535   PetscFunctionReturn(PETSC_SUCCESS);
536 }
537 
538 static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
539 {
540   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
541   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
542   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
543   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
544   PetscInt                           *AiUp, *AjUp;
545   PetscScalar                        *AAUp;
546   PetscScalar                        *AALo;
547   PetscInt                            nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
548   Mat_SeqSBAIJ                       *b  = (Mat_SeqSBAIJ *)A->data;
549   const PetscInt                     *ai = b->i, *aj = b->j, *vj;
550   const MatScalar                    *aa = b->a, *v;
551 
552   PetscFunctionBegin;
553   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
554   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
555     try {
556       PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
557       PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
558       if (!upTriFactor && !loTriFactor) {
559         /* Allocate Space for the upper triangular matrix */
560         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
561         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
562 
563         /* Fill the upper triangular matrix */
564         AiUp[0] = (PetscInt)0;
565         AiUp[n] = nzUpper;
566         offset  = 0;
567         for (i = 0; i < n; i++) {
568           /* set the pointers */
569           v  = aa + ai[i];
570           vj = aj + ai[i];
571           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
572 
573           /* first, set the diagonal elements */
574           AjUp[offset] = (PetscInt)i;
575           AAUp[offset] = (MatScalar)1.0 / v[nz];
576           AiUp[i]      = offset;
577           AALo[offset] = (MatScalar)1.0 / v[nz];
578 
579           offset += 1;
580           if (nz > 0) {
581             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
582             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
583             for (j = offset; j < offset + nz; j++) {
584               AAUp[j] = -AAUp[j];
585               AALo[j] = AAUp[j] / v[nz];
586             }
587             offset += nz;
588           }
589         }
590 
591         /* allocate space for the triangular factor information */
592         PetscCall(PetscNew(&upTriFactor));
593         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
594 
595         /* Create the matrix description */
596         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
597         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
598         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
599         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
600         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
601 
602         /* set the matrix */
603         upTriFactor->csrMat                 = new CsrMatrix;
604         upTriFactor->csrMat->num_rows       = A->rmap->n;
605         upTriFactor->csrMat->num_cols       = A->cmap->n;
606         upTriFactor->csrMat->num_entries    = a->nz;
607         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
608         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
609         upTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
610         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
611         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
612         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
613 
614         /* set the operation */
615         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
616 
617         /* Create the solve analysis information */
618         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
619         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
620         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
621                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
622         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
623 
624         /* perform the solve analysis */
625         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
626                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
627 
628         PetscCallHIP(WaitForHIP());
629         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
630 
631         /* assign the pointer */
632         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
633 
634         /* allocate space for the triangular factor information */
635         PetscCall(PetscNew(&loTriFactor));
636         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
637 
638         /* Create the matrix description */
639         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
640         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
641         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
642         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
643         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
644 
645         /* set the operation */
646         loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;
647 
648         /* set the matrix */
649         loTriFactor->csrMat                 = new CsrMatrix;
650         loTriFactor->csrMat->num_rows       = A->rmap->n;
651         loTriFactor->csrMat->num_cols       = A->cmap->n;
652         loTriFactor->csrMat->num_entries    = a->nz;
653         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
654         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
655         loTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
656         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
657         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
658         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
659 
660         /* Create the solve analysis information */
661         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
662         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
663         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
664                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
665         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
666 
667         /* perform the solve analysis */
668         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
669                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
670 
671         PetscCallHIP(WaitForHIP());
672         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
673 
674         /* assign the pointer */
675         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
676 
677         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
678         PetscCallHIP(hipHostFree(AiUp));
679         PetscCallHIP(hipHostFree(AjUp));
680       } else {
681         /* Fill the upper triangular matrix */
682         offset = 0;
683         for (i = 0; i < n; i++) {
684           /* set the pointers */
685           v  = aa + ai[i];
686           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
687 
688           /* first, set the diagonal elements */
689           AAUp[offset] = 1.0 / v[nz];
690           AALo[offset] = 1.0 / v[nz];
691 
692           offset += 1;
693           if (nz > 0) {
694             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
695             for (j = offset; j < offset + nz; j++) {
696               AAUp[j] = -AAUp[j];
697               AALo[j] = AAUp[j] / v[nz];
698             }
699             offset += nz;
700           }
701         }
702         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
704         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
705         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
706         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
707       }
708       PetscCallHIP(hipHostFree(AAUp));
709       PetscCallHIP(hipHostFree(AALo));
710     } catch (char *ex) {
711       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
712     }
713   }
714   PetscFunctionReturn(PETSC_SUCCESS);
715 }
716 
717 static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
718 {
719   PetscBool                      perm_identity;
720   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
721   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
722   IS                             ip                  = a->row;
723   PetscInt                       n                   = A->rmap->n;
724 
725   PetscFunctionBegin;
726   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
727   PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
728   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
729   hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
730 
731   A->offloadmask = PETSC_OFFLOAD_BOTH;
732   /* lower triangular indices */
733   PetscCall(ISIdentity(ip, &perm_identity));
734   if (!perm_identity) {
735     IS              iip;
736     const PetscInt *irip, *rip;
737 
738     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
739     PetscCall(ISGetIndices(iip, &irip));
740     PetscCall(ISGetIndices(ip, &rip));
741     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
742     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
743     hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
744     hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
745     PetscCall(ISRestoreIndices(iip, &irip));
746     PetscCall(ISDestroy(&iip));
747     PetscCall(ISRestoreIndices(ip, &rip));
748     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
749   }
750   PetscFunctionReturn(PETSC_SUCCESS);
751 }
752 
753 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
754 {
755   PetscBool   perm_identity;
756   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
757   IS          ip = b->row;
758 
759   PetscFunctionBegin;
760   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
761   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
762   B->offloadmask = PETSC_OFFLOAD_CPU;
763   /* determine which version of MatSolve needs to be used. */
764   PetscCall(ISIdentity(ip, &perm_identity));
765   if (perm_identity) {
766     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
767     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
768     B->ops->matsolve          = NULL;
769     B->ops->matsolvetranspose = NULL;
770   } else {
771     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE;
772     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE;
773     B->ops->matsolve          = NULL;
774     B->ops->matsolvetranspose = NULL;
775   }
776 
777   /* get the triangular factors */
778   PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
779   PetscFunctionReturn(PETSC_SUCCESS);
780 }
781 
782 static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
783 {
784   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
785   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
786   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
787   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
788   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
789   hipsparseIndexBase_t                indexBase;
790   hipsparseMatrixType_t               matrixType;
791   hipsparseFillMode_t                 fillMode;
792   hipsparseDiagType_t                 diagType;
793 
794   PetscFunctionBegin;
795   /* allocate space for the transpose of the lower triangular factor */
796   PetscCall(PetscNew(&loTriFactorT));
797   loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
798 
799   /* set the matrix descriptors of the lower triangular factor */
800   matrixType = hipsparseGetMatType(loTriFactor->descr);
801   indexBase  = hipsparseGetMatIndexBase(loTriFactor->descr);
802   fillMode   = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
803   diagType   = hipsparseGetMatDiagType(loTriFactor->descr);
804 
805   /* Create the matrix description */
806   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
807   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
808   PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
809   PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
810   PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
811 
812   /* set the operation */
813   loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
814 
815   /* allocate GPU space for the CSC of the lower triangular factor*/
816   loTriFactorT->csrMat                 = new CsrMatrix;
817   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
818   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
819   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
820   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
821   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
822   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
823 
824   /* compute the transpose of the lower triangular factor, i.e. the CSC */
825   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
826 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
827   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
828                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
829                                                   loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
830   PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
831 #endif
832 */
833   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
834 
835   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
836                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
837 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
838                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
839                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
840 #else
841                                        loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
842 #endif
843 
844   PetscCallHIP(WaitForHIP());
845   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
846 
847   /* Create the solve analysis information */
848   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
849   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
850   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
851                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
852   PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
853 
854   /* perform the solve analysis */
855   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
856                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
857 
858   PetscCallHIP(WaitForHIP());
859   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
860 
861   /* assign the pointer */
862   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
863 
864   /*********************************************/
865   /* Now the Transpose of the Upper Tri Factor */
866   /*********************************************/
867 
868   /* allocate space for the transpose of the upper triangular factor */
869   PetscCall(PetscNew(&upTriFactorT));
870   upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
871 
872   /* set the matrix descriptors of the upper triangular factor */
873   matrixType = hipsparseGetMatType(upTriFactor->descr);
874   indexBase  = hipsparseGetMatIndexBase(upTriFactor->descr);
875   fillMode   = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
876   diagType   = hipsparseGetMatDiagType(upTriFactor->descr);
877 
878   /* Create the matrix description */
879   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
880   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
881   PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
882   PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
883   PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
884 
885   /* set the operation */
886   upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
887 
888   /* allocate GPU space for the CSC of the upper triangular factor*/
889   upTriFactorT->csrMat                 = new CsrMatrix;
890   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
891   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
892   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
893   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
894   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
895   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
896 
897   /* compute the transpose of the upper triangular factor, i.e. the CSC */
898   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
899 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
900   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
901                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
902                                                   upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
903   PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
904 #endif
905 */
906   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
907   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
908                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
909 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
910                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
911                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
912 #else
913                                        upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
914 #endif
915 
916   PetscCallHIP(WaitForHIP());
917   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
918 
919   /* Create the solve analysis information */
920   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
921   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
922   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
923                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
924   PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
925 
926   /* perform the solve analysis */
927   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
928                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
929 
930   PetscCallHIP(WaitForHIP());
931   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
932 
933   /* assign the pointer */
934   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
935   PetscFunctionReturn(PETSC_SUCCESS);
936 }
937 
938 struct PetscScalarToPetscInt {
939   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
940 };
941 
942 static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
943 {
944   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
945   Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
946   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
947   hipsparseIndexBase_t           indexBase;
948 
949   PetscFunctionBegin;
950   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
951   matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
952   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
953   matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
954   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
955   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
956   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
957   PetscCall(PetscLogGpuTimeBegin());
958   if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
959   if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
960     matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
961     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
962     indexBase = hipsparseGetMatIndexBase(matstruct->descr);
963     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
964     PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
965 
966     /* set alpha and beta */
967     PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
968     PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
969     PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
970     PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
971     PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
972     PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
973 
974     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
975       CsrMatrix *matrixT      = new CsrMatrix;
976       matstructT->mat         = matrixT;
977       matrixT->num_rows       = A->cmap->n;
978       matrixT->num_cols       = A->rmap->n;
979       matrixT->num_entries    = a->nz;
980       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
981       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
982       matrixT->values         = new THRUSTARRAY(a->nz);
983 
984       if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
985       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
986 
987       PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
988                                             indexBase, hipsparse_scalartype));
989     } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
990       CsrMatrix *temp  = new CsrMatrix;
991       CsrMatrix *tempT = new CsrMatrix;
992       /* First convert HYB to CSR */
993       temp->num_rows       = A->rmap->n;
994       temp->num_cols       = A->cmap->n;
995       temp->num_entries    = a->nz;
996       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
997       temp->column_indices = new THRUSTINTARRAY32(a->nz);
998       temp->values         = new THRUSTARRAY(a->nz);
999 
1000       PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));
1001 
1002       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1003       tempT->num_rows       = A->rmap->n;
1004       tempT->num_cols       = A->cmap->n;
1005       tempT->num_entries    = a->nz;
1006       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1007       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1008       tempT->values         = new THRUSTARRAY(a->nz);
1009 
1010       PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1011                                            tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1012 
1013       /* Last, convert CSC to HYB */
1014       hipsparseHybMat_t hybMat;
1015       PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
1016       hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
1017       PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));
1018 
1019       /* assign the pointer */
1020       matstructT->mat = hybMat;
1021       A->transupdated = PETSC_TRUE;
1022       /* delete temporaries */
1023       if (tempT) {
1024         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1025         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1026         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1027         delete (CsrMatrix *)tempT;
1028       }
1029       if (temp) {
1030         if (temp->values) delete (THRUSTARRAY *)temp->values;
1031         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1032         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1033         delete (CsrMatrix *)temp;
1034       }
1035     }
1036   }
1037   if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1038     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1039     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1040     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1041     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1042     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1043     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1044     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1045     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1046     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1047     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1048     if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1049       hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1050       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1051       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1052     }
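    /* Build and cache csr2csc_i: run csr2csc once with the CSR entry positions 0..nnz-1 stored as scalar values, so that
       matrixT->values temporarily holds, for each CSC slot, the index of the CSR entry mapped to it; those indices are then
       converted to integers. Later transpose updates reduce to the permutation-iterator gather below, with no further csr2csc call. */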
1053     if (!hipsparsestruct->csr2csc_i) {
1054       THRUSTARRAY csr2csc_a(matrix->num_entries);
1055       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1056 
1057       indexBase = hipsparseGetMatIndexBase(matstruct->descr);
1058       if (matrix->num_entries) {
1059         /* This routine is known to give errors with CUDA-11 but works fine with CUDA-10.
1060            This still needs to be verified for ROCm.
1061         */
1062         PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1063                                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1064       } else {
1065         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1066       }
1067 
1068       hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1069       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1070     }
1071     PetscCallThrust(
1072       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1073   }
1074   PetscCall(PetscLogGpuTimeEnd());
1075   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
1076   /* the compressed row indices are not used for matTranspose */
1077   matstructT->cprowIndices = NULL;
1078   /* assign the pointer */
1079   ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
1080   A->transupdated                                 = PETSC_TRUE;
1081   PetscFunctionReturn(PETSC_SUCCESS);
1082 }
1083 
1084 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1085 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1086 {
1087   PetscInt                              n = xx->map->n;
1088   const PetscScalar                    *barray;
1089   PetscScalar                          *xarray;
1090   thrust::device_ptr<const PetscScalar> bGPU;
1091   thrust::device_ptr<PetscScalar>       xGPU;
1092   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1093   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1094   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1095   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1096 
1097   PetscFunctionBegin;
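  /* Since the factored matrix is L*U, the transposed system is solved with U^T first and then L^T, using the
     pre-built transposed factors; the row and column permutations are applied before and after, as in the steps below. */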
1098   /* Analyze the matrix and create the transpose ... on the fly */
1099   if (!loTriFactorT && !upTriFactorT) {
1100     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1101     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1102     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1103   }
1104 
1105   /* Get the GPU pointers */
1106   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1107   PetscCall(VecHIPGetArrayRead(bb, &barray));
1108   xGPU = thrust::device_pointer_cast(xarray);
1109   bGPU = thrust::device_pointer_cast(barray);
1110 
1111   PetscCall(PetscLogGpuTimeBegin());
1112   /* First, reorder with the row permutation */
1113   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);
1114 
1115   /* First, solve U */
1116   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1117                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1118 
1119   /* Then, solve L */
1120   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1121                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1122 
1123   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1124   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());
1125 
1126   /* Copy the temporary to the full solution. */
1127   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);
1128 
1129   /* restore */
1130   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1131   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1132   PetscCall(PetscLogGpuTimeEnd());
1133   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1134   PetscFunctionReturn(PETSC_SUCCESS);
1135 }
1136 
1137 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1138 {
1139   const PetscScalar                  *barray;
1140   PetscScalar                        *xarray;
1141   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1142   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1143   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1144   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1145 
1146   PetscFunctionBegin;
1147   /* Analyze the matrix and create the transpose ... on the fly */
1148   if (!loTriFactorT && !upTriFactorT) {
1149     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1150     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1151     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1152   }
1153 
1154   /* Get the GPU pointers */
1155   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1156   PetscCall(VecHIPGetArrayRead(bb, &barray));
1157 
1158   PetscCall(PetscLogGpuTimeBegin());
1159   /* First, solve U */
1160   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1161                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1162 
1163   /* Then, solve L */
1164   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1165                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1166 
1167   /* restore */
1168   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1169   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1170   PetscCall(PetscLogGpuTimeEnd());
1171   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1176 {
1177   const PetscScalar                    *barray;
1178   PetscScalar                          *xarray;
1179   thrust::device_ptr<const PetscScalar> bGPU;
1180   thrust::device_ptr<PetscScalar>       xGPU;
1181   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1182   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1183   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1184   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1185 
1186   PetscFunctionBegin;
1187   /* Get the GPU pointers */
1188   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1189   PetscCall(VecHIPGetArrayRead(bb, &barray));
1190   xGPU = thrust::device_pointer_cast(xarray);
1191   bGPU = thrust::device_pointer_cast(barray);
1192 
1193   PetscCall(PetscLogGpuTimeBegin());
1194   /* First, reorder with the row permutation */
1195   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
1196 
1197   /* Next, solve L */
1198   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1199                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1200 
1201   /* Then, solve U */
1202   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1203                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1204 
1205   /* Last, reorder with the column permutation */
1206   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);
1207 
1208   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1209   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1210   PetscCall(PetscLogGpuTimeEnd());
1211   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1212   PetscFunctionReturn(PETSC_SUCCESS);
1213 }
1214 
1215 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1216 {
1217   const PetscScalar                  *barray;
1218   PetscScalar                        *xarray;
1219   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1220   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1221   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1222   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1223 
1224   PetscFunctionBegin;
1225   /* Get the GPU pointers */
1226   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1227   PetscCall(VecHIPGetArrayRead(bb, &barray));
1228 
1229   PetscCall(PetscLogGpuTimeBegin());
1230   /* First, solve L */
1231   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1232                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1233 
1234   /* Next, solve U */
1235   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1236                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1237 
1238   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1239   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1240   PetscCall(PetscLogGpuTimeEnd());
1241   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1242   PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244 
1245 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1246 /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0 */
1247 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1248 {
1249   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1250   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1251   const PetscScalar             *barray;
1252   PetscScalar                   *xarray;
1253 
1254   PetscFunctionBegin;
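  /* With the ILU(0) factors stored as L (unit diagonal) and U in fs, M x = b is solved by the forward solve L y = b
     followed by the backward solve U x = y, as the two SpSV calls below show. */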
1255   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1256   PetscCall(VecHIPGetArrayRead(b, &barray));
1257   PetscCall(PetscLogGpuTimeBegin());
1258 
1259   /* Solve L*y = b */
1260   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1261   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1262   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1263   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                   /* L Y = X */
1264                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
1265   #else
1266   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                                     /* L Y = X */
1267                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
1268   #endif
1269   /* Solve U*x = y */
1270   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1271   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1272   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1273                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1274   #else
1275   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1276                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1277   #endif
1278   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1279   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1280 
1281   PetscCall(PetscLogGpuTimeEnd());
1282   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1283   PetscFunctionReturn(PETSC_SUCCESS);
1284 }
1285 
1286 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1287 {
1288   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1289   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1290   const PetscScalar             *barray;
1291   PetscScalar                   *xarray;
1292 
1293   PetscFunctionBegin;
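  /* For the transposed system, M^T = (L U)^T = U^T L^T, so we solve U^T y = b and then L^T x = y, reusing the
     L and U descriptors with HIPSPARSE_OPERATION_TRANSPOSE. */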
1294   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1295     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1296     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1297                                                 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1298 
1299     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
1300     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1301     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1302     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1303     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1304   }
1305 
1306   if (!fs->updatedTransposeSpSVAnalysis) {
1307     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1308 
1309     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1310     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1311   }
1312 
1313   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1314   PetscCall(VecHIPGetArrayRead(b, &barray));
1315   PetscCall(PetscLogGpuTimeBegin());
1316 
1317   /* Solve Ut*y = b */
1318   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1319   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1320   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1321   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1322                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1323   #else
1324   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1325                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1326   #endif
1327   /* Solve Lt*x = y */
1328   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1329   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1330   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1331                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1332   #else
1333   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1334                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1335   #endif
1336   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1337   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1338   PetscCall(PetscLogGpuTimeEnd());
1339   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1340   PetscFunctionReturn(PETSC_SUCCESS);
1341 }
1342 
1343 static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1344 {
1345   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1346   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1347   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1348   CsrMatrix                     *Acsr;
1349   PetscInt                       m, nz;
1350   PetscBool                      flg;
1351 
1352   PetscFunctionBegin;
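  /* ILU(0) keeps A's sparsity pattern, so the numeric phase only refreshes the factor values from A and redoes the
     factorization and the triangular-solve analyses on the structure set up in the symbolic phase. */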
1353   if (PetscDefined(USE_DEBUG)) {
1354     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1355     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1356   }
1357 
1358   /* Copy A's value to fact */
1359   m  = fact->rmap->n;
1360   nz = aij->nz;
1361   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1362   Acsr = (CsrMatrix *)Acusp->mat->mat;
1363   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1364 
1365   /* Factorize fact inplace */
1366   if (m)
1367     PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1368                                           fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1369   if (PetscDefined(USE_DEBUG)) {
1370     int               numerical_zero;
1371     hipsparseStatus_t status;
1372     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1373     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1374   }
1375 
1376   /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1377   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1378 
1379   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1380 
1381   /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1382   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1383 
1384   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1385   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ILU0;
1386   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1387   fact->ops->matsolve          = NULL;
1388   fact->ops->matsolvetranspose = NULL;
1389   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1390   PetscFunctionReturn(PETSC_SUCCESS);
1391 }
1392 
1393 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1394 {
1395   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1396   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1397   PetscInt                       m, nz;
1398 
1399   PetscFunctionBegin;
1400   if (PetscDefined(USE_DEBUG)) {
1401     PetscBool flg, diagDense;
1402 
1403     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1404     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1405     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1406     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1407     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1408   }
1409 
1410   /* Free the old stale stuff */
1411   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1412 
1413   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host,
1414      but they will not be used; we allocate them only to make debugging easier.
1415    */
1416   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1417 
1418   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1419   fact->factortype             = MAT_FACTOR_ILU;
1420   fact->info.factor_mallocs    = 0;
1421   fact->info.fill_ratio_given  = info->fill;
1422   fact->info.fill_ratio_needed = 1.0;
1423 
1424   aij->row = NULL;
1425   aij->col = NULL;
1426 
1427   /* ====================================================================== */
1428   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1429   /* We'll do in-place factorization on fact                                */
1430   /* ====================================================================== */
1431   const int *Ai, *Aj;
1432 
1433   m  = fact->rmap->n;
1434   nz = aij->nz;
1435 
1436   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1437   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1438   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1439   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1440   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1441   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1442 
1443   /* ====================================================================== */
1444   /* Create descriptors for M, L, U                                         */
1445   /* ====================================================================== */
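  /* L and U below are hipSPARSE views of the same CSR arrays (csrRowPtr/csrColIdx/csrVal) that hold the in-place
     ILU(0) factor M: L is flagged lower/unit-diagonal and U upper/non-unit, so no separate copies of the factors are stored. */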
1446   hipsparseFillMode_t fillMode;
1447   hipsparseDiagType_t diagType;
1448 
1449   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1450   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1451   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1452 
1453   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1454     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1455     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1456     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1457     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1458   */
1459   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1460   diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1461   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1462   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1463   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1464 
1465   fillMode = HIPSPARSE_FILL_MODE_UPPER;
1466   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1467   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1468   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1469   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1470 
1471   /* ========================================================================= */
1472   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1473   /* ========================================================================= */
1474   PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1475   if (m)
1476     PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1477                                                      fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1478 
1479   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1480   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1481 
1482   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1483   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1484 
1485   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1486   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1487 
1488   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1489   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1490 
1491   /* It appears spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1492      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1493    */
1494   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1495     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1496     fs->spsvBuffer_L = fs->factBuffer_M;
1497     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1498   } else {
1499     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1500     fs->spsvBuffer_U = fs->factBuffer_M;
1501     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1502   }
1503 
1504   /* ========================================================================== */
1505   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1506   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1507   /* ========================================================================== */
1508   int structural_zero;
1509 
1510   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1511   if (m)
1512     PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1513                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1514   if (PetscDefined(USE_DEBUG)) {
1515     /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1516     hipsparseStatus_t status;
1517     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1518     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1519   }
1520 
1521   /* Estimate FLOPs of the numeric factorization */
1522   {
1523     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
1524     PetscInt       *Ai, nzRow, nzLeft;
1525     PetscLogDouble  flops = 0.0;
1526     const PetscInt *Adiag;
1527 
1528     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &Adiag, NULL));
1529     Ai = Aseq->i;
1530     for (PetscInt i = 0; i < m; i++) {
1531       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1532         nzRow  = Ai[i + 1] - Ai[i];
1533         nzLeft = Adiag[i] - Ai[i];
1534         /* We eliminate the nzLeft nonzeros left of the diagonal one by one. Each elimination updates the nonzeros
1535            from the eliminated position to the end of the row, each update costing a multiplication and an addition.
1536         */
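        /* Rough count (an estimate, not exact ILU flops): eliminating the k-th of the nzLeft sub-diagonal entries updates
           the nzRow - k + 1 entries from that position to the end of the row at 2 flops each, so
           sum_{k=1..nzLeft} 2*(nzRow - k + 1) = nzLeft*(2*nzRow - nzLeft + 1), which is the formula used below. */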
1538         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1539       }
1540     }
1541     fs->numericFactFlops = flops;
1542   }
1543   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1544   PetscFunctionReturn(PETSC_SUCCESS);
1545 }
1546 
1547 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1548 {
1549   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1550   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1551   const PetscScalar             *barray;
1552   PetscScalar                   *xarray;
1553 
1554   PetscFunctionBegin;
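  /* With the IC(0) factor L (non-unit diagonal) in fs, M = L*L^T, so M x = b is solved by L y = b followed by
     L^T x = y; the transpose solve uses spsvDescr_Lt prepared in the symbolic phase. */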
1555   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1556   PetscCall(VecHIPGetArrayRead(b, &barray));
1557   PetscCall(PetscLogGpuTimeBegin());
1558 
1559   /* Solve L*y = b */
1560   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1561   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1562   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1563   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1564                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1565   #else
1566   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1567                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1568   #endif
1569   /* Solve Lt*x = y */
1570   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1571   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1572   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1573                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1574   #else
1575   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1576                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1577   #endif
1578   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1579   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1580 
1581   PetscCall(PetscLogGpuTimeEnd());
1582   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1583   PetscFunctionReturn(PETSC_SUCCESS);
1584 }
1585 
1586 static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1587 {
1588   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1589   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1590   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1591   CsrMatrix                     *Acsr;
1592   PetscInt                       m, nz;
1593   PetscBool                      flg;
1594 
1595   PetscFunctionBegin;
1596   if (PetscDefined(USE_DEBUG)) {
1597     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1598     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1599   }
1600 
1601   /* Copy A's value to fact */
1602   m  = fact->rmap->n;
1603   nz = aij->nz;
1604   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1605   Acsr = (CsrMatrix *)Acusp->mat->mat;
1606   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1607 
1608   /* Factorize fact inplace */
1609   /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1610      The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1611      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1612      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1613    */
1614   if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1615   if (PetscDefined(USE_DEBUG)) {
1616     int               numerical_zero;
1617     hipsparseStatus_t status;
1618     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1619     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1620   }
1621 
1622   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1623 
1624   /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1625     ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1626   */
1627   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1628 
1629   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1630   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
1631   fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0;
1632   fact->ops->matsolve          = NULL;
1633   fact->ops->matsolvetranspose = NULL;
1634   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637 
1638 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1639 {
1640   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1641   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1642   PetscInt                       m, nz;
1643 
1644   PetscFunctionBegin;
1645   if (PetscDefined(USE_DEBUG)) {
1646     PetscBool flg, diagDense;
1647 
1648     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1649     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1650     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1651     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1652     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1653   }
1654 
1655   /* Free the old stale stuff */
1656   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1657 
1658   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host,
1659      but they will not be used; we allocate them only to make debugging easier.
1660    */
1661   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1662 
1663   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1664   fact->factortype             = MAT_FACTOR_ICC;
1665   fact->info.factor_mallocs    = 0;
1666   fact->info.fill_ratio_given  = info->fill;
1667   fact->info.fill_ratio_needed = 1.0;
1668 
1669   aij->row = NULL;
1670   aij->col = NULL;
1671 
1672   /* ====================================================================== */
1673   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1674   /* We'll do in-place factorization on fact                                */
1675   /* ====================================================================== */
1676   const int *Ai, *Aj;
1677 
1678   m  = fact->rmap->n;
1679   nz = aij->nz;
1680 
1681   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1682   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1683   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1684   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1685   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1686   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1687 
1688   /* ====================================================================== */
1689   /* Create mat descriptors for M, L                                        */
1690   /* ====================================================================== */
1691   hipsparseFillMode_t fillMode;
1692   hipsparseDiagType_t diagType;
1693 
1694   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1695   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1696   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1697 
1698   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1699     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1700     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1701     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1702     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1703   */
1704   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1705   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1706   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1707   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1708   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1709 
1710   /* ========================================================================= */
1711   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1712   /* ========================================================================= */
1713   PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1714   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1715 
1716   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1717   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1718 
1719   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1720   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1721 
1722   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1723   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1724 
1725   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1726   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1727 
1728   /* To save device memory, we let the factorization buffer share storage with the larger of the two solve buffers.
1729      See also the comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1730    */
1731   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1732     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1733     fs->spsvBuffer_L = fs->factBuffer_M;
1734     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1735   } else {
1736     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1737     fs->spsvBuffer_Lt = fs->factBuffer_M;
1738     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1739   }
1740 
1741   /* ========================================================================== */
1742   /* Perform analysis of ic0 on M                                               */
1743   /* The lower triangular part of M has the same sparsity pattern as L          */
1744   /* ========================================================================== */
1745   int structural_zero;
1746 
1747   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1748   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1749   if (PetscDefined(USE_DEBUG)) {
1750     hipsparseStatus_t status;
1751     /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1752     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1753     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1754   }
1755 
1756   /* Estimate FLOPs of the numeric factorization */
1757   {
1758     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1759     PetscInt      *Ai, nzRow, nzLeft;
1760     PetscLogDouble flops = 0.0;
1761 
1762     Ai = Aseq->i;
1763     for (PetscInt i = 0; i < m; i++) {
1764       nzRow = Ai[i + 1] - Ai[i];
1765       if (nzRow > 1) {
1766         /* We want to eliminate the nonzeros to the left of the diagonal one by one. Assume that each time,
1767           the nonzeros to the right of (and including) the eliminated one are updated, and each update incurs a multiplication and an addition.
1768         */
1769         nzLeft = (nzRow - 1) / 2;
1770         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1771       }
1772     }
1773     fs->numericFactFlops = flops;
1774   }
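  /* Worked instance of the estimate above (illustrative, not from the original source): a row with nzRow = 5 nonzeros
     gives nzLeft = (5 - 1) / 2 = 2 and contributes 2 * (2.0 * 5 - 2 + 1) = 18 flops, while a row with a single
     nonzero contributes nothing. */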
1775   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1776   PetscFunctionReturn(PETSC_SUCCESS);
1777 }
1778 #endif
1779 
1780 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1781 {
1782   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1783 
1784   PetscFunctionBegin;
1785 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1786   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1787   if (!info->factoronhost) {
1788     PetscCall(ISIdentity(isrow, &row_identity));
1789     PetscCall(ISIdentity(iscol, &col_identity));
1790   }
1791   if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1792   else
1793 #endif
1794   {
1795     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1796     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1797     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1798   }
1799   PetscFunctionReturn(PETSC_SUCCESS);
1800 }
1801 
1802 static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1803 {
1804   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1805 
1806   PetscFunctionBegin;
1807   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1808   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1809   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1810   PetscFunctionReturn(PETSC_SUCCESS);
1811 }
1812 
1813 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1814 {
1815   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1816 
1817   PetscFunctionBegin;
1818 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1819   PetscBool perm_identity = PETSC_FALSE;
1820   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1821   if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1822   else
1823 #endif
1824   {
1825     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1826     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1827     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1828   }
1829   PetscFunctionReturn(PETSC_SUCCESS);
1830 }
1831 
1832 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1833 {
1834   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1835 
1836   PetscFunctionBegin;
1837   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1838   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1839   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1840   PetscFunctionReturn(PETSC_SUCCESS);
1841 }
1842 
1843 static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1844 {
1845   PetscFunctionBegin;
1846   *type = MATSOLVERHIPSPARSE;
1847   PetscFunctionReturn(PETSC_SUCCESS);
1848 }
1849 
1850 /*MC
1851   MATSOLVERHIPSPARSE = "hipsparse" - A matrix solver type providing triangular solvers for sequential matrices
1852   of type `MATSEQAIJHIPSPARSE` on a single GPU. The currently supported
1853   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1854   performance of the triangular solves. Full LU and Cholesky decompositions can be applied through the
1855   hipSPARSE triangular solve algorithm, but their performance can be quite poor and thus these
1856   algorithms are not recommended. This class does NOT support direct solver operations.
1857 
1858   Level: beginner
1859 
1860 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1861 M*/
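/* A minimal usage sketch (illustrative only, not part of this file), assuming `ksp` is an already configured KSP
   and `A` is its sequential AIJ operator:

     PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERHIPSPARSE));

   or, equivalently, from the options database:

     -mat_type seqaijhipsparse -pc_type ilu -pc_factor_mat_solver_type hipsparse
*/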
1862 
1863 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1864 {
1865   PetscInt n = A->rmap->n;
1866 
1867   PetscFunctionBegin;
1868   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1869   PetscCall(MatSetSizes(*B, n, n, n, n));
1870   (*B)->factortype = ftype;
1871   PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));
1872 
1873   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1874   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1875     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1876     if (!A->boundtocpu) {
1877       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1878       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1879     } else {
1880       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1881       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1882     }
1883     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1884     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1885     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1886   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1887     if (!A->boundtocpu) {
1888       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1889       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1890     } else {
1891       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1892       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1893     }
1894     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1895     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1896   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");
1897 
1898   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1899   (*B)->canuseordering = PETSC_TRUE;
1900   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1901   PetscFunctionReturn(PETSC_SUCCESS);
1902 }
1903 
1904 static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1905 {
1906   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
1907   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1908 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1909   Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1910 #endif
1911 
1912   PetscFunctionBegin;
1913   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1914     PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1915     if (A->factortype == MAT_FACTOR_NONE) {
1916       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1917       PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1918     }
1919 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1920     else if (fs->csrVal) {
1921       /* We have a factorized matrix on device and are able to copy it to host */
1922       PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1923     }
1924 #endif
1925     else
1926       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1927     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1928     PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1929     A->offloadmask = PETSC_OFFLOAD_BOTH;
1930   }
1931   PetscFunctionReturn(PETSC_SUCCESS);
1932 }
1933 
1934 static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1935 {
1936   PetscFunctionBegin;
1937   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1938   *array = ((Mat_SeqAIJ *)A->data)->a;
1939   PetscFunctionReturn(PETSC_SUCCESS);
1940 }
1941 
1942 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1943 {
1944   PetscFunctionBegin;
1945   A->offloadmask = PETSC_OFFLOAD_CPU;
1946   *array         = NULL;
1947   PetscFunctionReturn(PETSC_SUCCESS);
1948 }
1949 
1950 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1951 {
1952   PetscFunctionBegin;
1953   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1954   *array = ((Mat_SeqAIJ *)A->data)->a;
1955   PetscFunctionReturn(PETSC_SUCCESS);
1956 }
1957 
1958 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1959 {
1960   PetscFunctionBegin;
1961   *array = NULL;
1962   PetscFunctionReturn(PETSC_SUCCESS);
1963 }
1964 
1965 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1966 {
1967   PetscFunctionBegin;
1968   *array = ((Mat_SeqAIJ *)A->data)->a;
1969   PetscFunctionReturn(PETSC_SUCCESS);
1970 }
1971 
1972 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1973 {
1974   PetscFunctionBegin;
1975   A->offloadmask = PETSC_OFFLOAD_CPU;
1976   *array         = NULL;
1977   PetscFunctionReturn(PETSC_SUCCESS);
1978 }
1979 
1980 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1981 {
1982   Mat_SeqAIJHIPSPARSE *cusp;
1983   CsrMatrix           *matrix;
1984 
1985   PetscFunctionBegin;
1986   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1987   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1988   cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1989   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1990   matrix = (CsrMatrix *)cusp->mat->mat;
1991 
1992   if (i) {
1993 #if !defined(PETSC_USE_64BIT_INDICES)
1994     *i = matrix->row_offsets->data().get();
1995 #else
1996     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE does not support 64-bit indices");
1997 #endif
1998   }
1999   if (j) {
2000 #if !defined(PETSC_USE_64BIT_INDICES)
2001     *j = matrix->column_indices->data().get();
2002 #else
2003     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE does not support 64-bit indices");
2004 #endif
2005   }
2006   if (a) *a = matrix->values->data().get();
2007   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2008   PetscFunctionReturn(PETSC_SUCCESS);
2009 }
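/* Illustrative usage sketch (not part of this file): this callback backs the public MatSeqAIJGetCSRAndMemType()
   interface, which a caller might use to hand the device CSR arrays to a custom kernel; `A` is assumed to be an
   assembled MATSEQAIJHIPSPARSE matrix and the kernel launch is hypothetical.

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
     if (PetscMemTypeDevice(mtype)) { ... launch a HIP kernel on i, j, a ... }
*/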
2010 
2011 PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2012 {
2013   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2014   Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
2015   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
2016   PetscBool                      both            = PETSC_TRUE;
2017   PetscInt                       m               = A->rmap->n, *ii, *ridx, tmp;
2018 
2019   PetscFunctionBegin;
2020   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2021   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2022     if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2023       CsrMatrix *matrix;
2024       matrix = (CsrMatrix *)hipsparsestruct->mat->mat;
2025 
2026       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2027       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2028       matrix->values->assign(a->a, a->a + a->nz);
2029       PetscCallHIP(WaitForHIP());
2030       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2031       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2032       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
2033     } else {
2034       PetscInt nnz;
2035       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2036       PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2037       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
2038       delete hipsparsestruct->workVector;
2039       delete hipsparsestruct->rowoffsets_gpu;
2040       hipsparsestruct->workVector     = NULL;
2041       hipsparsestruct->rowoffsets_gpu = NULL;
2042       try {
2043         if (a->compressedrow.use) {
2044           m    = a->compressedrow.nrows;
2045           ii   = a->compressedrow.i;
2046           ridx = a->compressedrow.rindex;
2047         } else {
2048           m    = A->rmap->n;
2049           ii   = a->i;
2050           ridx = NULL;
2051         }
2052         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2053         if (!a->a) {
2054           nnz  = ii[m];
2055           both = PETSC_FALSE;
2056         } else nnz = a->nz;
2057         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2058 
2059         /* create hipsparse matrix */
2060         hipsparsestruct->nrows = m;
2061         matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
2062         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2063         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2064         PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2065 
2066         PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2067         PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2068         PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2069         PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2070         PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2071         PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2072         PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2073 
2074         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2075         if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2076           /* set the matrix */
2077           CsrMatrix *mat      = new CsrMatrix;
2078           mat->num_rows       = m;
2079           mat->num_cols       = A->cmap->n;
2080           mat->num_entries    = nnz;
2081           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2082           mat->column_indices = new THRUSTINTARRAY32(nnz);
2083           mat->values         = new THRUSTARRAY(nnz);
2084           mat->row_offsets->assign(ii, ii + m + 1);
2085           mat->column_indices->assign(a->j, a->j + nnz);
2086           if (a->a) mat->values->assign(a->a, a->a + nnz);
2087 
2088           /* assign the pointer */
2089           matstruct->mat = mat;
2090           if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2091             PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2092                                                   HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2093           }
2094         } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2095           CsrMatrix *mat      = new CsrMatrix;
2096           mat->num_rows       = m;
2097           mat->num_cols       = A->cmap->n;
2098           mat->num_entries    = nnz;
2099           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2100           mat->column_indices = new THRUSTINTARRAY32(nnz);
2101           mat->values         = new THRUSTARRAY(nnz);
2102           mat->row_offsets->assign(ii, ii + m + 1);
2103           mat->column_indices->assign(a->j, a->j + nnz);
2104           if (a->a) mat->values->assign(a->a, a->a + nnz);
2105 
2106           hipsparseHybMat_t hybMat;
2107           PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2108           hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2109           PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2110           /* assign the pointer */
2111           matstruct->mat = hybMat;
2112 
2113           if (mat) {
2114             if (mat->values) delete (THRUSTARRAY *)mat->values;
2115             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2116             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2117             delete (CsrMatrix *)mat;
2118           }
2119         }
2120 
2121         /* assign the compressed row indices */
2122         if (a->compressedrow.use) {
2123           hipsparsestruct->workVector = new THRUSTARRAY(m);
2124           matstruct->cprowIndices     = new THRUSTINTARRAY(m);
2125           matstruct->cprowIndices->assign(ridx, ridx + m);
2126           tmp = m;
2127         } else {
2128           hipsparsestruct->workVector = NULL;
2129           matstruct->cprowIndices     = NULL;
2130           tmp                         = 0;
2131         }
2132         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2133 
2134         /* assign the pointer */
2135         hipsparsestruct->mat = matstruct;
2136       } catch (char *ex) {
2137         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2138       }
2139       PetscCallHIP(WaitForHIP());
2140       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2141       hipsparsestruct->nonzerostate = A->nonzerostate;
2142     }
2143     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2144   }
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 
2148 struct VecHIPPlusEquals {
2149   template <typename Tuple>
2150   __host__ __device__ void operator()(Tuple t)
2151   {
2152     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2153   }
2154 };
2155 
2156 struct VecHIPEquals {
2157   template <typename Tuple>
2158   __host__ __device__ void operator()(Tuple t)
2159   {
2160     thrust::get<1>(t) = thrust::get<0>(t);
2161   }
2162 };
2163 
2164 struct VecHIPEqualsReverse {
2165   template <typename Tuple>
2166   __host__ __device__ void operator()(Tuple t)
2167   {
2168     thrust::get<0>(t) = thrust::get<1>(t);
2169   }
2170 };
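/* Illustrative sketch (not part of this file): functors like these are meant to be applied with thrust::for_each over
   a zip iterator, e.g. scattering a compressed work vector into selected rows of an output vector. The names below are
   assumptions: `w` a THRUSTARRAY holding the compressed result, `yptr` a thrust::device_ptr<PetscScalar> to the output
   values, and `cprow` a THRUSTINTARRAY of target row indices.

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(w->begin(), thrust::make_permutation_iterator(yptr, cprow->begin()))),
                      thrust::make_zip_iterator(thrust::make_tuple(w->end(), thrust::make_permutation_iterator(yptr, cprow->end()))),
                      VecHIPPlusEquals());
*/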
2171 
2172 struct MatProductCtx_MatMatHipsparse {
2173   PetscBool             cisdense;
2174   PetscScalar          *Bt;
2175   Mat                   X;
2176   PetscBool             reusesym; /* hipSPARSE does not have split symbolic and numeric phases for sparse mat-mat operations */
2177   PetscLogDouble        flops;
2178   CsrMatrix            *Bcsr;
2179   hipsparseSpMatDescr_t matSpBDescr;
2180   PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
2181   hipsparseDnMatDescr_t matBDescr;
2182   hipsparseDnMatDescr_t matCDescr;
2183   PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2184 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2185   void *dBuffer4, *dBuffer5;
2186 #endif
2187   size_t                 mmBufferSize;
2188   void                  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2189   hipsparseSpGEMMDescr_t spgemmDesc;
2190 };
2191 
2192 static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
2193 {
2194   MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;
2195 
2196   PetscFunctionBegin;
2197   PetscCallHIP(hipFree(mmdata->Bt));
2198   delete mmdata->Bcsr;
2199   if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2200   if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2201   if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2202   if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2203 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2204   if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2205   if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2206 #endif
2207   if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2208   if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2209   PetscCall(MatDestroy(&mmdata->X));
2210   PetscCall(PetscFree(*(void **)data));
2211   PetscFunctionReturn(PETSC_SUCCESS);
2212 }
2213 
2214 static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2215 {
2216   Mat_Product                   *product = C->product;
2217   Mat                            A, B;
2218   PetscInt                       m, n, blda, clda;
2219   PetscBool                      flg, biship;
2220   Mat_SeqAIJHIPSPARSE           *cusp;
2221   hipsparseOperation_t           opA;
2222   const PetscScalar             *barray;
2223   PetscScalar                   *carray;
2224   MatProductCtx_MatMatHipsparse *mmdata;
2225   Mat_SeqAIJHIPSPARSEMultStruct *mat;
2226   CsrMatrix                     *csrmat;
2227 
2228   PetscFunctionBegin;
2229   MatCheckProduct(C, 1);
2230   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2231   mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
2232   A      = product->A;
2233   B      = product->B;
2234   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2235   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2236   /* currently CopyToGPU does not copy if the matrix is bound to the CPU.
2237      Instead of silently accepting a wrong answer, we prefer to raise an error */
2238   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2239   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2240   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2241   switch (product->type) {
2242   case MATPRODUCT_AB:
2243   case MATPRODUCT_PtAP:
2244     mat = cusp->mat;
2245     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2246     m   = A->rmap->n;
2247     n   = B->cmap->n;
2248     break;
2249   case MATPRODUCT_AtB:
2250     if (!A->form_explicit_transpose) {
2251       mat = cusp->mat;
2252       opA = HIPSPARSE_OPERATION_TRANSPOSE;
2253     } else {
2254       PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2255       mat = cusp->matTranspose;
2256       opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2257     }
2258     m = A->cmap->n;
2259     n = B->cmap->n;
2260     break;
2261   case MATPRODUCT_ABt:
2262   case MATPRODUCT_RARt:
2263     mat = cusp->mat;
2264     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2265     m   = A->rmap->n;
2266     n   = B->rmap->n;
2267     break;
2268   default:
2269     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2270   }
2271   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2272   csrmat = (CsrMatrix *)mat->mat;
2273   /* if the user passed a CPU matrix, copy the data to the GPU */
2274   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2275   if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
2276   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2277   PetscCall(MatDenseGetLDA(B, &blda));
2278   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2279     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2280     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2281   } else {
2282     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2283     PetscCall(MatDenseGetLDA(C, &clda));
2284   }
2285 
2286   PetscCall(PetscLogGpuTimeBegin());
2287   hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2288   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2289   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2290     size_t mmBufferSize;
2291     if (mmdata->initialized && mmdata->Blda != blda) {
2292       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2293       mmdata->matBDescr = NULL;
2294     }
2295     if (!mmdata->matBDescr) {
2296       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2297       mmdata->Blda = blda;
2298     }
2299     if (mmdata->initialized && mmdata->Clda != clda) {
2300       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2301       mmdata->matCDescr = NULL;
2302     }
2303     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2304       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2305       mmdata->Clda = clda;
2306     }
2307     if (!mat->matDescr) {
2308       PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2309                                             HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2310     }
2311     PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2312     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2313       PetscCallHIP(hipFree(mmdata->mmBuffer));
2314       PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2315       mmdata->mmBufferSize = mmBufferSize;
2316     }
2317     mmdata->initialized = PETSC_TRUE;
2318   } else {
2319     /* to be safe, always update pointers of the mats */
2320     PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2321     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2322     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2323   }
2324 
2325   /* do hipsparseSpMM, which supports transpose on B */
2326   PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2327 
2328   PetscCall(PetscLogGpuTimeEnd());
2329   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2330   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2331   if (product->type == MATPRODUCT_RARt) {
2332     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2333     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2334   } else if (product->type == MATPRODUCT_PtAP) {
2335     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2336     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2337   } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2338   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2339   if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2340   PetscFunctionReturn(PETSC_SUCCESS);
2341 }
2342 
2343 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2344 {
2345   Mat_Product                   *product = C->product;
2346   Mat                            A, B;
2347   PetscInt                       m, n;
2348   PetscBool                      cisdense, flg;
2349   MatProductCtx_MatMatHipsparse *mmdata;
2350   Mat_SeqAIJHIPSPARSE           *cusp;
2351 
2352   PetscFunctionBegin;
2353   MatCheckProduct(C, 1);
2354   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2355   A = product->A;
2356   B = product->B;
2357   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2358   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2359   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2360   PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2361   switch (product->type) {
2362   case MATPRODUCT_AB:
2363     m = A->rmap->n;
2364     n = B->cmap->n;
2365     break;
2366   case MATPRODUCT_AtB:
2367     m = A->cmap->n;
2368     n = B->cmap->n;
2369     break;
2370   case MATPRODUCT_ABt:
2371     m = A->rmap->n;
2372     n = B->rmap->n;
2373     break;
2374   case MATPRODUCT_PtAP:
2375     m = B->cmap->n;
2376     n = B->cmap->n;
2377     break;
2378   case MATPRODUCT_RARt:
2379     m = B->rmap->n;
2380     n = B->rmap->n;
2381     break;
2382   default:
2383     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2384   }
2385   PetscCall(MatSetSizes(C, m, n, m, n));
2386   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2387   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2388   PetscCall(MatSetType(C, MATSEQDENSEHIP));
2389 
2390   /* product data */
2391   PetscCall(PetscNew(&mmdata));
2392   mmdata->cisdense = cisdense;
2393   /* for these products we need intermediate storage */
2394   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2395     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2396     PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2397     /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2398     if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2399     else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2400   }
2401   C->product->data       = mmdata;
2402   C->product->destroy    = MatProductCtxDestroy_MatMatHipsparse;
2403   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2404   PetscFunctionReturn(PETSC_SUCCESS);
2405 }
2406 
2407 static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2408 {
2409   Mat_Product                   *product = C->product;
2410   Mat                            A, B;
2411   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2412   Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
2413   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2414   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2415   PetscBool                      flg;
2416   MatProductType                 ptype;
2417   MatProductCtx_MatMatHipsparse *mmdata;
2418   hipsparseSpMatDescr_t          BmatSpDescr;
2419   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2420 
2421   PetscFunctionBegin;
2422   MatCheckProduct(C, 1);
2423   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2424   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
2425   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2426   mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
2427   A      = product->A;
2428   B      = product->B;
2429   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2430     mmdata->reusesym = PETSC_FALSE;
2431     Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2432     PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2433     Cmat = Ccusp->mat;
2434     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2435     Ccsr = (CsrMatrix *)Cmat->mat;
2436     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2437     goto finalize;
2438   }
2439   if (!c->nz) goto finalize;
2440   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2441   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2442   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2443   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2444   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2445   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2446   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2447   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2448   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2449   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2450   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2451   PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2452   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2453   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2454 
2455   ptype = product->type;
2456   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2457     ptype = MATPRODUCT_AB;
2458     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2459   }
2460   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2461     ptype = MATPRODUCT_AB;
2462     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2463   }
2464   switch (ptype) {
2465   case MATPRODUCT_AB:
2466     Amat = Acusp->mat;
2467     Bmat = Bcusp->mat;
2468     break;
2469   case MATPRODUCT_AtB:
2470     Amat = Acusp->matTranspose;
2471     Bmat = Bcusp->mat;
2472     break;
2473   case MATPRODUCT_ABt:
2474     Amat = Acusp->mat;
2475     Bmat = Bcusp->matTranspose;
2476     break;
2477   default:
2478     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2479   }
2480   Cmat = Ccusp->mat;
2481   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2482   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2483   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2484   Acsr = (CsrMatrix *)Amat->mat;
2485   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2486   Ccsr = (CsrMatrix *)Cmat->mat;
2487   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2488   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2489   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2490   PetscCall(PetscLogGpuTimeBegin());
2491 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2492   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2493   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2494   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2495   PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2496   #else
2497   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2498   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2499   #endif
2500 #else
2501   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2502                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2503                                           Ccsr->column_indices->data().get()));
2504 #endif
2505   PetscCall(PetscLogGpuFlops(mmdata->flops));
2506   PetscCallHIP(WaitForHIP());
2507   PetscCall(PetscLogGpuTimeEnd());
2508   C->offloadmask = PETSC_OFFLOAD_GPU;
2509 finalize:
2510   /* shorter version of MatAssemblyEnd_SeqAIJ */
2511   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2512   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2513   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2514   c->reallocs = 0;
2515   C->info.mallocs += 0;
2516   C->info.nz_unneeded = 0;
2517   C->assembled = C->was_assembled = PETSC_TRUE;
2518   C->num_ass++;
2519   PetscFunctionReturn(PETSC_SUCCESS);
2520 }
2521 
2522 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2523 {
2524   Mat_Product                   *product = C->product;
2525   Mat                            A, B;
2526   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2527   Mat_SeqAIJ                    *a, *b, *c;
2528   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2529   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2530   PetscInt                       i, j, m, n, k;
2531   PetscBool                      flg;
2532   MatProductType                 ptype;
2533   MatProductCtx_MatMatHipsparse *mmdata;
2534   PetscLogDouble                 flops;
2535   PetscBool                      biscompressed, ciscompressed;
2536 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2537   int64_t               C_num_rows1, C_num_cols1, C_nnz1;
2538   hipsparseSpMatDescr_t BmatSpDescr;
2539 #else
2540   int cnz;
2541 #endif
2542   hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2543 
2544   PetscFunctionBegin;
2545   MatCheckProduct(C, 1);
2546   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2547   A = product->A;
2548   B = product->B;
2549   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2550   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2551   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2552   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2553   a = (Mat_SeqAIJ *)A->data;
2554   b = (Mat_SeqAIJ *)B->data;
2555   /* product data */
2556   PetscCall(PetscNew(&mmdata));
2557   C->product->data    = mmdata;
2558   C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2559 
2560   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2561   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2562   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2563   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2564   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2565   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2566 
2567   ptype = product->type;
2568   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2569     ptype                                          = MATPRODUCT_AB;
2570     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2571   }
2572   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2573     ptype                                          = MATPRODUCT_AB;
2574     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2575   }
2576   biscompressed = PETSC_FALSE;
2577   ciscompressed = PETSC_FALSE;
2578   switch (ptype) {
2579   case MATPRODUCT_AB:
2580     m    = A->rmap->n;
2581     n    = B->cmap->n;
2582     k    = A->cmap->n;
2583     Amat = Acusp->mat;
2584     Bmat = Bcusp->mat;
2585     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2586     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2587     break;
2588   case MATPRODUCT_AtB:
2589     m = A->cmap->n;
2590     n = B->cmap->n;
2591     k = A->rmap->n;
2592     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2593     Amat = Acusp->matTranspose;
2594     Bmat = Bcusp->mat;
2595     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2596     break;
2597   case MATPRODUCT_ABt:
2598     m = A->rmap->n;
2599     n = B->rmap->n;
2600     k = A->cmap->n;
2601     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2602     Amat = Acusp->mat;
2603     Bmat = Bcusp->matTranspose;
2604     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2605     break;
2606   default:
2607     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2608   }
2609 
2610   /* create hipsparse matrix */
2611   PetscCall(MatSetSizes(C, m, n, m, n));
2612   PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2613   c     = (Mat_SeqAIJ *)C->data;
2614   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2615   Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
2616   Ccsr  = new CsrMatrix;
2617 
2618   c->compressedrow.use = ciscompressed;
2619   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
2620     c->compressedrow.nrows = a->compressedrow.nrows;
2621     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2622     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2623     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2624     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2625     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2626   } else {
2627     c->compressedrow.nrows  = 0;
2628     c->compressedrow.i      = NULL;
2629     c->compressedrow.rindex = NULL;
2630     Ccusp->workVector       = NULL;
2631     Cmat->cprowIndices      = NULL;
2632   }
2633   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2634   Ccusp->mat        = Cmat;
2635   Ccusp->mat->mat   = Ccsr;
2636   Ccsr->num_rows    = Ccusp->nrows;
2637   Ccsr->num_cols    = n;
2638   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2639   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2640   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2641   PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2642   PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2643   PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2644   PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2645   PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2646   PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2647   PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2648   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipSPARSE raises errors in different calls when matrices have zero rows/columns! */
2649     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2650     c->nz                = 0;
2651     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2652     Ccsr->values         = new THRUSTARRAY(c->nz);
2653     goto finalizesym;
2654   }
2655 
2656   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2657   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2658   Acsr = (CsrMatrix *)Amat->mat;
2659   if (!biscompressed) {
2660     Bcsr        = (CsrMatrix *)Bmat->mat;
2661     BmatSpDescr = Bmat->matDescr;
2662   } else { /* we need to use row offsets for the full matrix */
2663     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2664     Bcsr                 = new CsrMatrix;
2665     Bcsr->num_rows       = B->rmap->n;
2666     Bcsr->num_cols       = cBcsr->num_cols;
2667     Bcsr->num_entries    = cBcsr->num_entries;
2668     Bcsr->column_indices = cBcsr->column_indices;
2669     Bcsr->values         = cBcsr->values;
2670     if (!Bcusp->rowoffsets_gpu) {
2671       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2672       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2673       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2674     }
2675     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2676     mmdata->Bcsr      = Bcsr;
2677     if (Bcsr->num_rows && Bcsr->num_cols) {
2678       PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2679     }
2680     BmatSpDescr = mmdata->matSpBDescr;
2681   }
2682   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2683   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2684   /* precompute flops count */
2685   if (ptype == MATPRODUCT_AB) {
2686     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2687       const PetscInt st = a->i[i];
2688       const PetscInt en = a->i[i + 1];
2689       for (j = st; j < en; j++) {
2690         const PetscInt brow = a->j[j];
2691         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2692       }
2693     }
2694   } else if (ptype == MATPRODUCT_AtB) {
2695     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2696       const PetscInt anzi = a->i[i + 1] - a->i[i];
2697       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2698       flops += (2. * anzi) * bnzi;
2699     }
2700   } else flops = 0.; /* TODO */
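  /* Worked instance of the MATPRODUCT_AB count above (illustrative, not from the original source): if row i of A has
     nonzeros in columns {2, 5} and rows 2 and 5 of B hold 3 and 4 nonzeros respectively, then row i contributes
     2*3 + 2*4 = 14 flops to the estimate. */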
2701 
2702   mmdata->flops = flops;
2703   PetscCall(PetscLogGpuTimeBegin());
2704 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2705   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2706   PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2707   PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2708   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2709   {
2710     /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2711      We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2712   */
2713     void *dBuffer1 = NULL;
2714     void *dBuffer2 = NULL;
2715     void *dBuffer3 = NULL;
2716     /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2717     size_t bufferSize1 = 0;
2718     size_t bufferSize2 = 0;
2719     size_t bufferSize3 = 0;
2720     size_t bufferSize4 = 0;
2721     size_t bufferSize5 = 0;
2722 
2723     /* ask bufferSize1 bytes for external memory */
2724     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2725     PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2726     /* inspect the matrices A and B to understand the memory requirement for the next step */
2727     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));
2728 
2729     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2730     PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2731     PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2732     PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2733     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2734     PetscCallHIP(hipFree(dBuffer1));
2735     PetscCallHIP(hipFree(dBuffer2));
2736 
2737     /* get matrix C non-zero entries C_nnz1 */
2738     PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2739     c->nz = (PetscInt)C_nnz1;
2740     /* allocate matrix C */
2741     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2742     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2743     Ccsr->values = new THRUSTARRAY(c->nz);
2744     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2745     /* update matC with the new pointers */
2746     if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2747       PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2748 
2749       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2750       PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2751       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2752       PetscCallHIP(hipFree(dBuffer3));
2753       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2754     }
2755     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2756   }
2757   #else
2758   size_t bufSize2;
2759   /* ask bufferSize bytes for external memory */
2760   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2761   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2762   /* inspect the matrices A and B to understand the memory requirement for the next step */
2763   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2764   /* ask bufferSize again bytes for external memory */
2765   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2766   /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2767      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2768      it only appears in workEstimation, but it seems to be needed in compute, so the address
2769      is probably stored in the descriptor! What a messy API... */
2770   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2771   /* compute the intermediate product of A * B */
2772   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2773   /* get matrix C non-zero entries C_nnz1 */
2774   PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2775   c->nz = (PetscInt)C_nnz1;
2776   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2777                       mmdata->mmBufferSize / 1024));
2778   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2779   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2780   Ccsr->values = new THRUSTARRAY(c->nz);
2781   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2782   PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2783   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2784   #endif
2785 #else
2786   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2787   PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2788                                           Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2789   c->nz                = cnz;
2790   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2791   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2792   Ccsr->values = new THRUSTARRAY(c->nz);
2793   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2794 
2795   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2796   /* With the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2797       I have tried the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
2798       D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2799   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2800                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2801                                           Ccsr->column_indices->data().get()));
2802 #endif
2803   PetscCall(PetscLogGpuFlops(mmdata->flops));
2804   PetscCall(PetscLogGpuTimeEnd());
2805 finalizesym:
2806   c->free_a = PETSC_TRUE;
2807   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2808   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2809   c->free_ij = PETSC_TRUE;
2810   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2811     PetscInt      *d_i = c->i;
2812     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2813     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2814     ii = *Ccsr->row_offsets;
2815     jj = *Ccsr->column_indices;
2816     if (ciscompressed) d_i = c->compressedrow.i;
2817     PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2818     PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2819   } else {
2820     PetscInt *d_i = c->i;
2821     if (ciscompressed) d_i = c->compressedrow.i;
2822     PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2823     PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2824   }
2825   if (ciscompressed) { /* need to expand host row offsets */
2826     PetscInt r = 0;
2827     c->i[0]    = 0;
2828     for (k = 0; k < c->compressedrow.nrows; k++) {
2829       const PetscInt next = c->compressedrow.rindex[k];
2830       const PetscInt old  = c->compressedrow.i[k];
2831       for (; r < next; r++) c->i[r + 1] = old;
2832     }
2833     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2834   }
2835   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2836   PetscCall(PetscMalloc1(m, &c->ilen));
2837   PetscCall(PetscMalloc1(m, &c->imax));
2838   c->maxnz         = c->nz;
2839   c->nonzerorowcnt = 0;
2840   c->rmax          = 0;
2841   for (k = 0; k < m; k++) {
2842     const PetscInt nn = c->i[k + 1] - c->i[k];
2843     c->ilen[k] = c->imax[k] = nn;
2844     c->nonzerorowcnt += (PetscInt)!!nn;
2845     c->rmax = PetscMax(c->rmax, nn);
2846   }
2847   PetscCall(PetscMalloc1(c->nz, &c->a));
2848   Ccsr->num_entries = c->nz;
2849 
2850   C->nonzerostate++;
2851   PetscCall(PetscLayoutSetUp(C->rmap));
2852   PetscCall(PetscLayoutSetUp(C->cmap));
2853   Ccusp->nonzerostate = C->nonzerostate;
2854   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2855   C->preallocated     = PETSC_TRUE;
2856   C->assembled        = PETSC_FALSE;
2857   C->was_assembled    = PETSC_FALSE;
2858   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2859     mmdata->reusesym = PETSC_TRUE;
2860     C->offloadmask   = PETSC_OFFLOAD_GPU;
2861   }
2862   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2863   PetscFunctionReturn(PETSC_SUCCESS);
2864 }
2865 
2866 /* handles sparse or dense B */
2867 static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2868 {
2869   Mat_Product *product = mat->product;
2870   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
2871 
2872   PetscFunctionBegin;
2873   MatCheckProduct(mat, 1);
2874   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2875   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2876   if (product->type == MATPRODUCT_ABC) {
2877     Ciscusp = PETSC_FALSE;
2878     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2879   }
2880   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2881     PetscBool usecpu = PETSC_FALSE;
2882     switch (product->type) {
2883     case MATPRODUCT_AB:
2884       if (product->api_user) {
2885         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2886         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2887         PetscOptionsEnd();
2888       } else {
2889         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2890         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2891         PetscOptionsEnd();
2892       }
2893       break;
2894     case MATPRODUCT_AtB:
2895       if (product->api_user) {
2896         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2897         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2898         PetscOptionsEnd();
2899       } else {
2900         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2901         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2902         PetscOptionsEnd();
2903       }
2904       break;
2905     case MATPRODUCT_PtAP:
2906       if (product->api_user) {
2907         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2908         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2909         PetscOptionsEnd();
2910       } else {
2911         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2912         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2913         PetscOptionsEnd();
2914       }
2915       break;
2916     case MATPRODUCT_RARt:
2917       if (product->api_user) {
2918         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2919         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2920         PetscOptionsEnd();
2921       } else {
2922         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2923         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2924         PetscOptionsEnd();
2925       }
2926       break;
2927     case MATPRODUCT_ABC:
2928       if (product->api_user) {
2929         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2930         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2931         PetscOptionsEnd();
2932       } else {
2933         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2934         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2935         PetscOptionsEnd();
2936       }
2937       break;
2938     default:
2939       break;
2940     }
2941     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2942   }
2943   /* dispatch */
2944   if (isdense) {
2945     switch (product->type) {
2946     case MATPRODUCT_AB:
2947     case MATPRODUCT_AtB:
2948     case MATPRODUCT_ABt:
2949     case MATPRODUCT_PtAP:
2950     case MATPRODUCT_RARt:
2951       if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2952       else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2953       break;
2954     case MATPRODUCT_ABC:
2955       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2956       break;
2957     default:
2958       break;
2959     }
2960   } else if (Biscusp && Ciscusp) {
2961     switch (product->type) {
2962     case MATPRODUCT_AB:
2963     case MATPRODUCT_AtB:
2964     case MATPRODUCT_ABt:
2965       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2966       break;
2967     case MATPRODUCT_PtAP:
2968     case MATPRODUCT_RARt:
2969     case MATPRODUCT_ABC:
2970       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2971       break;
2972     default:
2973       break;
2974     }
2975   } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2976   PetscFunctionReturn(PETSC_SUCCESS);
2977 }
2978 
2979 static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2980 {
2981   PetscFunctionBegin;
2982   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2983   PetscFunctionReturn(PETSC_SUCCESS);
2984 }
2985 
2986 static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2987 {
2988   PetscFunctionBegin;
2989   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2990   PetscFunctionReturn(PETSC_SUCCESS);
2991 }
2992 
2993 static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2994 {
2995   PetscFunctionBegin;
2996   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2997   PetscFunctionReturn(PETSC_SUCCESS);
2998 }
2999 
3000 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3001 {
3002   PetscFunctionBegin;
3003   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3004   PetscFunctionReturn(PETSC_SUCCESS);
3005 }
3006 
3007 static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3008 {
3009   PetscFunctionBegin;
3010   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3011   PetscFunctionReturn(PETSC_SUCCESS);
3012 }
3013 
3014 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3015 {
3016   int i = blockIdx.x * blockDim.x + threadIdx.x;
3017   if (i < n) y[idx[i]] += x[i];
3018 }
3019 
3020 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3021 static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3022 {
3023   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
3024   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3025   Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3026   PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
3027   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3028   PetscBool                      compressed;
3029   PetscInt                       nx, ny;
3030 
3031   PetscFunctionBegin;
3032   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3033   if (!a->nz) {
3034     if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
3035     else PetscCall(VecSeq_HIP::Set(zz, 0));
3036     PetscFunctionReturn(PETSC_SUCCESS);
3037   }
3038   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3039   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3040   if (!trans) {
3041     matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3042     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3043   } else {
3044     if (herm || !A->form_explicit_transpose) {
3045       opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3046       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3047     } else {
3048       if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3049       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3050     }
3051   }
3052   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3053   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3054   try {
3055     PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3056     if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3057     else PetscCall(VecHIPGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3058 
3059     PetscCall(PetscLogGpuTimeBegin());
3060     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3061       /* z = A x + beta y.
3062          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3063          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3064       */
3065       xptr = xarray;
3066       dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3067       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3068       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3069           allocated to accommodate different uses. So we get the length info directly from mat.
3070        */
3071       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3072         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3073         nx             = mat->num_cols;
3074         ny             = mat->num_rows;
3075       }
3076     } else {
3077       /* z = A^T x + beta y
3078          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3079          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3080        */
3081       xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3082       dptr = zarray;
3083       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3084       if (compressed) { /* Scatter x to work vector */
3085         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3086         thrust::for_each(
3087 #if PetscDefined(HAVE_THRUST_ASYNC)
3088           thrust::hip::par.on(PetscDefaultHipStream),
3089 #endif
3090           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3091           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3092       }
3093       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3094         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3095         nx             = mat->num_rows;
3096         ny             = mat->num_cols;
3097       }
3098     }
3099     /* csr_spmv does y = alpha op(A) x + beta y */
3100     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3101 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) && !PETSC_PKG_HIP_VERSION_EQ(7, 2, 0)
3102       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
3103       if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3104         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3105         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3106         PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3107                                                     &matstruct->hipSpMV[opA].spmvBufferSize));
3108         PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3109         matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3110       } else {
3111         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3112         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3113         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3114       }
3115       PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3116                                        matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3117 #else
3118       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3119       nx             = mat->num_rows; /* nx,ny are set before the #if block, set them again to avoid set-but-not-used warning */
3120       ny             = mat->num_cols;
3121       PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, nx, ny, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3122 #endif
3123     } else {
3124       if (hipsparsestruct->nrows) {
3125         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3126         PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3127       }
3128     }
3129     PetscCall(PetscLogGpuTimeEnd());
3130 
3131     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3132       if (yy) {                                     /* MatMultAdd: zz = A*xx + yy */
3133         if (compressed) {                           /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3134           PetscCall(VecSeq_HIP::Copy(yy, zz));      /* zz = yy */
3135         } else if (zz != yy) {                      /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3136           PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3137         }
3138       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3139         PetscCall(VecSeq_HIP::Set(zz, 0));
3140       }
3141 
3142       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3143       if (compressed) {
3144         PetscCall(PetscLogGpuTimeBegin());
3145         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3146            and when the event goes out of scope its destructor calls hipStreamSynchronize() on this stream. One would have to store all events to
3147            prevent that. So I just use a ScatterAdd kernel instead.
3148          */
3149 #if 0
3150         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3151         thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3152                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3153                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3154                          VecHIPPlusEquals());
3155 #else
3156         PetscInt n = matstruct->cprowIndices->size();
3157         hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3158 #endif
3159         PetscCall(PetscLogGpuTimeEnd());
3160       }
3161     } else {
3162       if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3163     }
3164     PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3165     if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3166     else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
3167   } catch (char *ex) {
3168     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3169   }
3170   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3171   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3172   PetscFunctionReturn(PETSC_SUCCESS);
3173 }
3174 
3175 static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3176 {
3177   PetscFunctionBegin;
3178   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3179   PetscFunctionReturn(PETSC_SUCCESS);
3180 }
3181 
3182 static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3183 {
3184   PetscFunctionBegin;
3185   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3186   PetscFunctionReturn(PETSC_SUCCESS);
3187 }
3188 
3189 /*@
3190   MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
3191   This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3192 
3193   Collective
3194 
3195   Input Parameters:
3196 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3197 . m    - number of rows
3198 . n    - number of columns
3199 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3200 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3201 
3202   Output Parameter:
3203 . A - the matrix
3204 
3205   Level: intermediate
3206 
3207   Notes:
3208   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3209   `MatXXXXSetPreallocation()` paradigm (see the example below) instead of this routine directly.
3210   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3211 
3212   The AIJ format (compressed row storage), is fully compatible with standard Fortran
3213   storage.  That is, the stored row and column indices can begin at
3214   either one (as in Fortran) or zero.
3215 
3216   Specify the preallocated storage with either `nz` or `nnz` (not both).
3217   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3218   allocation.
3219 
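  Example Usage:
  A minimal sketch (illustrative only; destruction of `A` omitted) of the recommended `MatCreate()`/`MatSetType()` path, where `m`, `n`, and `nz` are as described above
.vb
  Mat A;

  PetscCall(MatCreate(PETSC_COMM_SELF, &A));
  PetscCall(MatSetSizes(A, m, n, m, n));
  PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
  PetscCall(MatSeqAIJSetPreallocation(A, nz, NULL)); /* or pass an nnz[] array instead of nz */
.ve
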
3220 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3221 @*/
3222 PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3223 {
3224   PetscFunctionBegin;
3225   PetscCall(MatCreate(comm, A));
3226   PetscCall(MatSetSizes(*A, m, n, m, n));
3227   PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
3228   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3229   PetscFunctionReturn(PETSC_SUCCESS);
3230 }
3231 
3232 static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3233 {
3234   PetscFunctionBegin;
3235   if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
3236   else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3237   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3238   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3239   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3240   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3241   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3242   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3243   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3244   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3245   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3246   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3247   PetscCall(MatDestroy_SeqAIJ(A));
3248   PetscFunctionReturn(PETSC_SUCCESS);
3249 }
3250 
3251 static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3252 {
3253   PetscFunctionBegin;
3254   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3255   PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
3256   PetscFunctionReturn(PETSC_SUCCESS);
3257 }
3258 
3259 static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3260 {
3261   Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3262   Mat_SeqAIJHIPSPARSE *cy;
3263   Mat_SeqAIJHIPSPARSE *cx;
3264   PetscScalar         *ay;
3265   const PetscScalar   *ax;
3266   CsrMatrix           *csry, *csrx;
3267 
3268   PetscFunctionBegin;
3269   cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3270   cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3271   if (X->ops->axpy != Y->ops->axpy) {
3272     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3273     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3274     PetscFunctionReturn(PETSC_SUCCESS);
3275   }
3276   /* if we are here, it means both matrices are bound to GPU */
3277   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3278   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3279   PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3280   PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3281   csry = (CsrMatrix *)cy->mat->mat;
3282   csrx = (CsrMatrix *)cx->mat->mat;
3283   /* see if we can turn this into a hipblas axpy */
3284   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3285     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3286     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3287     if (eq) str = SAME_NONZERO_PATTERN;
3288   }
3289   /* spgeam is buggy with one column */
3290   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3291   if (str == SUBSET_NONZERO_PATTERN) {
3292     PetscScalar b = 1.0;
3293 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3294     size_t bufferSize;
3295     void  *buffer;
3296 #endif
3297 
3298     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3299     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3300     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
3301 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3302     PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3303                                                        csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3304     PetscCallHIP(hipMalloc(&buffer, bufferSize));
3305     PetscCall(PetscLogGpuTimeBegin());
3306     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3307                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3308     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3309     PetscCall(PetscLogGpuTimeEnd());
3310     PetscCallHIP(hipFree(buffer));
3311 #else
3312     PetscCall(PetscLogGpuTimeBegin());
3313     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3314                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3315     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3316     PetscCall(PetscLogGpuTimeEnd());
3317 #endif
3318     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
3319     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3320     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3321   } else if (str == SAME_NONZERO_PATTERN) {
3322     hipblasHandle_t hipblasv2handle;
3323     PetscBLASInt    one = 1, bnz = 1;
3324 
3325     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3326     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3327     PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3328     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3329     PetscCall(PetscLogGpuTimeBegin());
3330     PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
3331     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3332     PetscCall(PetscLogGpuTimeEnd());
3333     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3334     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3335   } else {
3336     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3337     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3338   }
3339   PetscFunctionReturn(PETSC_SUCCESS);
3340 }
3341 
3342 static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3343 {
3344   Mat_SeqAIJ     *y = (Mat_SeqAIJ *)Y->data;
3345   PetscScalar    *ay;
3346   hipblasHandle_t hipblasv2handle;
3347   PetscBLASInt    one = 1, bnz = 1;
3348 
3349   PetscFunctionBegin;
3350   PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3351   PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3352   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3353   PetscCall(PetscLogGpuTimeBegin());
3354   PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3355   PetscCall(PetscLogGpuFlops(bnz));
3356   PetscCall(PetscLogGpuTimeEnd());
3357   PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3358   PetscFunctionReturn(PETSC_SUCCESS);
3359 }
3360 
3361 static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3362 {
3363   PetscBool   both = PETSC_FALSE;
3364   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
3365 
3366   PetscFunctionBegin;
3367   if (A->factortype == MAT_FACTOR_NONE) {
3368     Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3369     if (spptr->mat) {
3370       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3371       if (matrix->values) {
3372         both = PETSC_TRUE;
3373         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3374       }
3375     }
3376     if (spptr->matTranspose) {
3377       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3378       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3379     }
3380   }
3381   //PetscCall(MatZeroEntries_SeqAIJ(A));
3382   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3383   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3384   else A->offloadmask = PETSC_OFFLOAD_CPU;
3385   PetscFunctionReturn(PETSC_SUCCESS);
3386 }
3387 
3388 static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
3389 {
3390   PetscFunctionBegin;
3391   *m = PETSC_MEMTYPE_HIP;
3392   PetscFunctionReturn(PETSC_SUCCESS);
3393 }
3394 
3395 static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3396 {
3397   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3398 
3399   PetscFunctionBegin;
3400   if (A->factortype != MAT_FACTOR_NONE) {
3401     A->boundtocpu = flg;
3402     PetscFunctionReturn(PETSC_SUCCESS);
3403   }
3404   if (flg) {
3405     PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
3406 
3407     A->ops->scale                     = MatScale_SeqAIJ;
3408     A->ops->axpy                      = MatAXPY_SeqAIJ;
3409     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3410     A->ops->mult                      = MatMult_SeqAIJ;
3411     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3412     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3413     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3414     A->ops->multhermitiantranspose    = NULL;
3415     A->ops->multhermitiantransposeadd = NULL;
3416     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3417     A->ops->getcurrentmemtype         = NULL;
3418     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3419     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3420     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3421     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3422     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3423     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3424     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3425   } else {
3426     A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
3427     A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
3428     A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
3429     A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
3430     A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
3431     A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
3432     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3433     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3434     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3435     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3436     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJHIPSPARSE;
3437     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3438     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3439     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3440     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3441     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3442     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3443     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3444     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3445     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3446     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3447     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3448     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3449     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3450   }
3451   A->boundtocpu = flg;
3452   if (flg && a->inode.size_csr) a->inode.use = PETSC_TRUE;
3453   else a->inode.use = PETSC_FALSE;
3454   PetscFunctionReturn(PETSC_SUCCESS);
3455 }
3456 
3457 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3458 {
3459   Mat B;
3460 
3461   PetscFunctionBegin;
3462   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
3463   if (reuse == MAT_INITIAL_MATRIX) {
3464     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3465   } else if (reuse == MAT_REUSE_MATRIX) {
3466     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3467   }
3468   B = *newmat;
3469   PetscCall(PetscFree(B->defaultvectype));
3470   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
3471   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3472     if (B->factortype == MAT_FACTOR_NONE) {
3473       Mat_SeqAIJHIPSPARSE *spptr;
3474       PetscCall(PetscNew(&spptr));
3475       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3476       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3477       spptr->format = MAT_HIPSPARSE_CSR;
3478 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3479       spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3480 #else
3481       spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3482 #endif
3483       spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3484       //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;
3485 
3486       B->spptr = spptr;
3487     } else {
3488       Mat_SeqAIJHIPSPARSETriFactors *spptr;
3489 
3490       PetscCall(PetscNew(&spptr));
3491       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3492       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3493       B->spptr = spptr;
3494     }
3495     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3496   }
3497   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJHIPSPARSE;
3498   B->ops->destroy           = MatDestroy_SeqAIJHIPSPARSE;
3499   B->ops->setoption         = MatSetOption_SeqAIJHIPSPARSE;
3500   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJHIPSPARSE;
3501   B->ops->bindtocpu         = MatBindToCPU_SeqAIJHIPSPARSE;
3502   B->ops->duplicate         = MatDuplicate_SeqAIJHIPSPARSE;
3503   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;
3504 
3505   PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
3506   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
3507   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
3508 #if defined(PETSC_HAVE_HYPRE)
3509   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
3510 #endif
3511   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
3512   PetscFunctionReturn(PETSC_SUCCESS);
3513 }
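
/* Usage note (illustrative sketch only, not code exercised in this file): an existing MATSEQAIJ matrix A can reach the
   conversion above through the public API, e.g.

     PetscCall(MatConvert(A, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &A));

   or at runtime via -mat_type aijhipsparse together with MatSetFromOptions(). */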
3514 
3515 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
3516 {
3517   PetscFunctionBegin;
3518   PetscCall(MatCreate_SeqAIJ(B));
3519   PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
3520   PetscFunctionReturn(PETSC_SUCCESS);
3521 }
3522 
3523 /*MC
3524    MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
3525 
3526    A matrix type whose data resides on AMD GPUs. These matrices can be in either
3527    CSR, ELL, or Hybrid format.
3528    All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.
3529 
3530    Options Database Keys:
3531 +  -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3532 .  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3533                                        Other options include ell (ellpack) or hyb (hybrid).
3534 . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3535 -  -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3536 
3537   Level: beginner
3538 
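   Example Usage:
   An illustrative command-line sketch (the executable name myapp is a placeholder) selecting this type and one of the storage formats listed above
.vb
   ./myapp -mat_type aijhipsparse -mat_hipsparse_storage_format csr
.ve
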
3539 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3540 M*/
3541 
3542 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3543 {
3544   PetscFunctionBegin;
3545   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
3546   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
3547   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
3548   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
3549   PetscFunctionReturn(PETSC_SUCCESS);
3550 }
3551 
3552 static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
3553 {
3554   Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
3555 
3556   PetscFunctionBegin;
3557   if (cusp) {
3558     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
3559     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3560     delete cusp->workVector;
3561     delete cusp->rowoffsets_gpu;
3562     delete cusp->csr2csc_i;
3563     delete cusp->coords;
3564     if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
3565     PetscCall(PetscFree(mat->spptr));
3566   }
3567   PetscFunctionReturn(PETSC_SUCCESS);
3568 }
3569 
3570 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3571 {
3572   PetscFunctionBegin;
3573   if (*mat) {
3574     delete (*mat)->values;
3575     delete (*mat)->column_indices;
3576     delete (*mat)->row_offsets;
3577     delete *mat;
3578     *mat = 0;
3579   }
3580   PetscFunctionReturn(PETSC_SUCCESS);
3581 }
3582 
3583 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
3584 {
3585   PetscFunctionBegin;
3586   if (*trifactor) {
3587     if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
3588     if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3589     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3590     if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
3591     if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h));
3592     if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
3593     PetscCall(PetscFree(*trifactor));
3594   }
3595   PetscFunctionReturn(PETSC_SUCCESS);
3596 }
3597 
3598 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3599 {
3600   CsrMatrix *mat;
3601 
3602   PetscFunctionBegin;
3603   if (*matstruct) {
3604     if ((*matstruct)->mat) {
3605       if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3606         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3607         PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
3608       } else {
3609         mat = (CsrMatrix *)(*matstruct)->mat;
3610         PetscCall(CsrMatrix_Destroy(&mat));
3611       }
3612     }
3613     if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
3614     delete (*matstruct)->cprowIndices;
3615     if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
3616     if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
3617     if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));
3618 
3619     Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3620     if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
3621     for (int i = 0; i < 3; i++) {
3622       if (mdata->hipSpMV[i].initialized) {
3623         PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
3624         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
3625         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
3626       }
3627     }
3628     delete *matstruct;
3629     *matstruct = NULL;
3630   }
3631   PetscFunctionReturn(PETSC_SUCCESS);
3632 }
3633 
3634 PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
3635 {
3636   Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;
3637 
3638   PetscFunctionBegin;
3639   if (fs) {
3640     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3641     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3642     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3643     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3644     delete fs->rpermIndices;
3645     delete fs->cpermIndices;
3646     delete fs->workVector;
3647     fs->rpermIndices  = NULL;
3648     fs->cpermIndices  = NULL;
3649     fs->workVector    = NULL;
3650     fs->init_dev_prop = PETSC_FALSE;
3651 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3652     PetscCallHIP(hipFree(fs->csrRowPtr));
3653     PetscCallHIP(hipFree(fs->csrColIdx));
3654     PetscCallHIP(hipFree(fs->csrVal));
3655     PetscCallHIP(hipFree(fs->X));
3656     PetscCallHIP(hipFree(fs->Y));
3657     // PetscCallHIP(hipFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
3658     PetscCallHIP(hipFree(fs->spsvBuffer_L));
3659     PetscCallHIP(hipFree(fs->spsvBuffer_U));
3660     PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
3661     PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
3662     PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
3663     if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
3664     if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
3665     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
3666     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3667     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
3668     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3669     if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
3670     if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
3671     PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
3672     PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));
3673 
3674     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3675     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3676 #endif
3677   }
3678   PetscFunctionReturn(PETSC_SUCCESS);
3679 }
3680 
3681 static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3682 {
3683   hipsparseHandle_t handle;
3684 
3685   PetscFunctionBegin;
3686   if (*trifactors) {
3687     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3688     if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3689     PetscCall(PetscFree(*trifactors));
3690   }
3691   PetscFunctionReturn(PETSC_SUCCESS);
3692 }
3693 
3694 struct IJCompare {
3695   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3696   {
3697     if (t1.get<0>() < t2.get<0>()) return true;
3698     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3699     return false;
3700   }
3701 };
3702 
3703 static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3704 {
3705   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3706 
3707   PetscFunctionBegin;
3708   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3709   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3710   if (destroy) {
3711     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3712     delete cusp->csr2csc_i;
3713     cusp->csr2csc_i = NULL;
3714   }
3715   A->transupdated = PETSC_FALSE;
3716   PetscFunctionReturn(PETSC_SUCCESS);
3717 }
3718 
3719 static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(PetscCtxRt data)
3720 {
3721   MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)data;
3722 
3723   PetscFunctionBegin;
3724   PetscCallHIP(hipFree(coo->perm));
3725   PetscCallHIP(hipFree(coo->jmap));
3726   PetscCall(PetscFree(coo));
3727   PetscFunctionReturn(PETSC_SUCCESS);
3728 }
3729 
3730 static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3731 {
3732   PetscBool            dev_ij = PETSC_FALSE;
3733   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
3734   PetscInt            *i, *j;
3735   PetscContainer       container_h;
3736   MatCOOStruct_SeqAIJ *coo_h, *coo_d;
3737 
3738   PetscFunctionBegin;
3739   PetscCall(PetscGetMemType(coo_i, &mtype));
3740   if (PetscMemTypeDevice(mtype)) {
3741     dev_ij = PETSC_TRUE;
3742     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
3743     PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3744     PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3745   } else {
3746     i = coo_i;
3747     j = coo_j;
3748   }
3749   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
3750   if (dev_ij) PetscCall(PetscFree2(i, j));
3751   mat->offloadmask = PETSC_OFFLOAD_CPU;
3752   // Create the GPU memory
3753   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));
3754 
3755   // Copy the COO struct to device
3756   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
3757   PetscCall(PetscContainerGetPointer(container_h, &coo_h));
3758   PetscCall(PetscMalloc1(1, &coo_d));
3759   *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
3760   PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
3761   PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
3762   PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
3763   PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));
3764 
3765   // Put the COO struct in a container and then attach that to the matrix
3766   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
3767   PetscFunctionReturn(PETSC_SUCCESS);
3768 }
3769 
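/* Grid-stride kernel used by MatSetValuesCOO_SeqAIJHIPSPARSE() below: for CSR nonzero i, the range
   jmap[i]..jmap[i+1] delimits the entries of perm[] that index the (possibly repeated) COO values in
   kv[]; their sum is inserted into, or added to, a[i] depending on imode. */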
3770 __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
3771 {
3772   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
3773   const PetscCount grid_size = gridDim.x * blockDim.x;
3774   for (; i < nnz; i += grid_size) {
3775     PetscScalar sum = 0.0;
3776     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
3777     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
3778   }
3779 }
3780 
3781 static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3782 {
3783   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
3784   Mat_SeqAIJHIPSPARSE *dev  = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3785   PetscCount           Annz = seq->nz;
3786   PetscMemType         memtype;
3787   const PetscScalar   *v1 = v;
3788   PetscScalar         *Aa;
3789   PetscContainer       container;
3790   MatCOOStruct_SeqAIJ *coo;
3791 
3792   PetscFunctionBegin;
3793   if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3794 
3795   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
3796   PetscCall(PetscContainerGetPointer(container, &coo));
3797 
3798   PetscCall(PetscGetMemType(v, &memtype));
3799   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
3800     PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
3801     PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
3802   }
3803 
3804   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
3805   else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));
3806 
3807   PetscCall(PetscLogGpuTimeBegin());
3808   if (Annz) {
3809     hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
3810     PetscCallHIP(hipPeekAtLastError());
3811   }
3812   PetscCall(PetscLogGpuTimeEnd());
3813 
3814   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
3815   else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
3816 
3817   if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
3818   PetscFunctionReturn(PETSC_SUCCESS);
3819 }
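
/*
  A minimal COO assembly sketch using the public MatSetPreallocationCOO()/MatSetValuesCOO() interface,
  which dispatches to the two routines above for MATSEQAIJHIPSPARSE; the sizes and index/value arrays
  below are illustrative placeholders, and the values may also be passed from device memory.

    Mat         A;
    PetscInt    coo_i[] = {0, 0, 1};       // COO row indices
    PetscInt    coo_j[] = {0, 1, 1};       // COO column indices
    PetscScalar coo_v[] = {1.0, 2.0, 3.0}; // COO values

    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, 2, 2, 2, 2));
    PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
    PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // builds the host and device COO structs
    PetscCall(MatSetValuesCOO(A, coo_v, INSERT_VALUES));   // launches MatAddCOOValues() on the GPU
*/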
3820 
3821 /*@C
3822   MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3823 
3824   Not Collective
3825 
3826   Input Parameters:
3827 + A          - the matrix
3828 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
3829 
3830   Output Parameters:
3831 + i - the CSR row pointers
3832 - j - the CSR column indices
3833 
3834   Level: developer
3835 
3836   Note:
3837   When compressed is true, the CSR structure does not contain empty rows
3838 
3839 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3840 @*/
3841 PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
3842 {
3843   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3844   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
3845   CsrMatrix           *csr;
3846 
3847   PetscFunctionBegin;
3848   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3849   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
3850   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3851   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3852   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3853   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3854   csr = (CsrMatrix *)cusp->mat->mat;
3855   if (i) {
3856     if (!compressed && a->compressedrow.use) { /* need full row offset */
3857       if (!cusp->rowoffsets_gpu) {
3858         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3859         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3860         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3861       }
3862       *i = cusp->rowoffsets_gpu->data().get();
3863     } else *i = csr->row_offsets->data().get();
3864   }
3865   if (j) *j = csr->column_indices->data().get();
3866   PetscFunctionReturn(PETSC_SUCCESS);
3867 }
3868 
3869 /*@C
3870   MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3871 
3872   Not Collective
3873 
3874   Input Parameters:
3875 + A          - the matrix
3876 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
3877 . i          - the CSR row pointers
3878 - j          - the CSR column indices
3879 
3880   Level: developer
3881 
3882 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3883 @*/
3884 PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
3885 {
3886   PetscFunctionBegin;
3887   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3888   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3889   if (i) *i = NULL;
3890   if (j) *j = NULL;
3891   PetscFunctionReturn(PETSC_SUCCESS);
3892 }
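
/*
  A small sketch of the Get/Restore pairing above, assuming `A` is an assembled MATSEQAIJHIPSPARSE
  matrix; the returned pointers reference device memory and are handed back with the restore call.

    const int *di, *dj;
    PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &di, &dj)); // full (uncompressed) row offsets
    // ... pass di/dj to a HIP kernel or hipSPARSE routine ...
    PetscCall(MatSeqAIJHIPSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj));
*/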
3893 
3894 /*@C
3895   MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3896 
3897   Not Collective
3898 
3899   Input Parameter:
3900 . A - a `MATSEQAIJHIPSPARSE` matrix
3901 
3902   Output Parameter:
3903 . a - pointer to the device data
3904 
3905   Level: developer
3906 
3907   Note:
3908   May trigger host-device copies if the up-to-date matrix data is on host
3909 
3910 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3911 @*/
3912 PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
3913 {
3914   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3915   CsrMatrix           *csr;
3916 
3917   PetscFunctionBegin;
3918   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3919   PetscAssertPointer(a, 2);
3920   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3921   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3922   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3923   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3924   csr = (CsrMatrix *)cusp->mat->mat;
3925   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3926   *a = csr->values->data().get();
3927   PetscFunctionReturn(PETSC_SUCCESS);
3928 }
3929 
3930 /*@C
3931   MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3932 
3933   Not Collective
3934 
3935   Input Parameters:
3936 + A - a `MATSEQAIJHIPSPARSE` matrix
3937 - a - pointer to the device data
3938 
3939   Level: developer
3940 
3941 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3942 @*/
3943 PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
3944 {
3945   PetscFunctionBegin;
3946   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3947   PetscAssertPointer(a, 2);
3948   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3949   *a = NULL;
3950   PetscFunctionReturn(PETSC_SUCCESS);
3951 }
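
/*
  A read-only access sketch, assuming `A` is an assembled MATSEQAIJHIPSPARSE matrix; `av` points at
  the CSR values in device memory and must not be modified before the matching restore call.

    const PetscScalar *av;
    PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av)); // may first copy host values to the GPU
    // ... read av[] from a HIP kernel or hipSPARSE/hipBLAS call ...
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
*/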
3952 
3953 /*@C
3954   MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3955 
3956   Not Collective
3957 
3958   Input Parameter:
3959 . A - a `MATSEQAIJHIPSPARSE` matrix
3960 
3961   Output Parameter:
3962 . a - pointer to the device data
3963 
3964   Level: developer
3965 
3966   Note:
3967   May trigger host-device copies if up-to-date matrix data is on host
3968 
3969 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3970 @*/
3971 PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
3972 {
3973   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3974   CsrMatrix           *csr;
3975 
3976   PetscFunctionBegin;
3977   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3978   PetscAssertPointer(a, 2);
3979   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3980   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3981   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3982   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3983   csr = (CsrMatrix *)cusp->mat->mat;
3984   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3985   *a             = csr->values->data().get();
3986   A->offloadmask = PETSC_OFFLOAD_GPU;
3987   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
3988   PetscFunctionReturn(PETSC_SUCCESS);
3989 }
3990 /*@C
3991   MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
3992 
3993   Not Collective
3994 
3995   Input Parameters:
3996 + A - a `MATSEQAIJHIPSPARSE` matrix
3997 - a - pointer to the device data
3998 
3999   Level: developer
4000 
4001 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4002 @*/
4003 PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
4004 {
4005   PetscFunctionBegin;
4006   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4007   PetscAssertPointer(a, 2);
4008   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4009   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4010   *a = NULL;
4011   PetscFunctionReturn(PETSC_SUCCESS);
4012 }
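
/*
  A read-write sketch, assuming `A` is an assembled MATSEQAIJHIPSPARSE matrix; the get call marks the
  GPU copy as the valid one and invalidates any cached transpose, and the restore call bumps the
  object state so PETSc knows the values changed.

    PetscScalar *a;
    PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &a)); // may first copy host values to the GPU
    // ... modify a[] in place on the device, e.g. with a HIP kernel or thrust::transform ...
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &a));
*/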
4013 
4014 /*@C
4015   MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4016 
4017   Not Collective
4018 
4019   Input Parameter:
4020 . A - a `MATSEQAIJHIPSPARSE` matrix
4021 
4022   Output Parameter:
4023 . a - pointer to the device data
4024 
4025   Level: developer
4026 
4027   Note:
4028   Does not trigger host-device copies and flags data validity on the GPU
4029 
4030 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4031 @*/
4032 PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
4033 {
4034   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4035   CsrMatrix           *csr;
4036 
4037   PetscFunctionBegin;
4038   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4039   PetscAssertPointer(a, 2);
4040   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4041   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4042   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4043   csr = (CsrMatrix *)cusp->mat->mat;
4044   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4045   *a             = csr->values->data().get();
4046   A->offloadmask = PETSC_OFFLOAD_GPU;
4047   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
4048   PetscFunctionReturn(PETSC_SUCCESS);
4049 }
4050 
4051 /*@C
4052   MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
4053 
4054   Not Collective
4055 
4056   Input Parameters:
4057 + A - a `MATSEQAIJHIPSPARSE` matrix
4058 - a - pointer to the device data
4059 
4060   Level: developer
4061 
4062 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4063 @*/
4064 PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
4065 {
4066   PetscFunctionBegin;
4067   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4068   PetscAssertPointer(a, 2);
4069   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4070   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4071   *a = NULL;
4072   PetscFunctionReturn(PETSC_SUCCESS);
4073 }
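
/*
  A write-only sketch: no host-to-device copy of stale values is triggered, so this assumes `A`
  already has its CSR structure set up on the GPU (e.g. right after assembly).

    PetscScalar *a;
    PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &a));
    PetscCallHIP(hipMemset(a, 0, ((Mat_SeqAIJ *)A->data)->nz * sizeof(PetscScalar))); // e.g. zero all stored nonzeros
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &a));
*/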
4074 
4075 struct IJCompare4 {
4076   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4077   {
4078     if (t1.get<0>() < t2.get<0>()) return true;
4079     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4080     return false;
4081   }
4082 };
4083 
4084 struct Shift {
4085   int _shift;
4086 
4087   Shift(int shift) : _shift(shift) { }
4088   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4089 };
4090 
4091 /* merges two SeqAIJHIPSPARSE matrices A and B by concatenating their rows side by side, i.e. C = [A, B], the [A';B']' operation in MATLAB notation */
4092 PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4093 {
4094   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4095   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4096   Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4097   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
4098   PetscInt                       Annz, Bnnz;
4099   PetscInt                       i, m, n, zero = 0;
4100 
4101   PetscFunctionBegin;
4102   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4103   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4104   PetscAssertPointer(C, 4);
4105   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4106   PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4107   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4108   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4109   PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4110   PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4111   if (reuse == MAT_INITIAL_MATRIX) {
4112     m = A->rmap->n;
4113     n = A->cmap->n + B->cmap->n;
4114     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4115     PetscCall(MatSetSizes(*C, m, n, m, n));
4116     PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4117     c                       = (Mat_SeqAIJ *)(*C)->data;
4118     Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4119     Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
4120     Ccsr                    = new CsrMatrix;
4121     Cmat->cprowIndices      = NULL;
4122     c->compressedrow.use    = PETSC_FALSE;
4123     c->compressedrow.nrows  = 0;
4124     c->compressedrow.i      = NULL;
4125     c->compressedrow.rindex = NULL;
4126     Ccusp->workVector       = NULL;
4127     Ccusp->nrows            = m;
4128     Ccusp->mat              = Cmat;
4129     Ccusp->mat->mat         = Ccsr;
4130     Ccsr->num_rows          = m;
4131     Ccsr->num_cols          = n;
4132     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4133     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4134     PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4135     PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4136     PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4137     PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4138     PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4139     PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4140     PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4141     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4142     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4143     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4144     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4145 
4146     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4147     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4148     Annz                 = (PetscInt)Acsr->column_indices->size();
4149     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4150     c->nz                = Annz + Bnnz;
4151     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4152     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4153     Ccsr->values         = new THRUSTARRAY(c->nz);
4154     Ccsr->num_entries    = c->nz;
4155     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4156     if (c->nz) {
4157       auto              Acoo = new THRUSTINTARRAY32(Annz);
4158       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4159       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4160       THRUSTINTARRAY32 *Aroff, *Broff;
4161 
4162       if (a->compressedrow.use) { /* need full row offset */
4163         if (!Acusp->rowoffsets_gpu) {
4164           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4165           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4166           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4167         }
4168         Aroff = Acusp->rowoffsets_gpu;
4169       } else Aroff = Acsr->row_offsets;
4170       if (b->compressedrow.use) { /* need full row offset */
4171         if (!Bcusp->rowoffsets_gpu) {
4172           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4173           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4174           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4175         }
4176         Broff = Bcusp->rowoffsets_gpu;
4177       } else Broff = Bcsr->row_offsets;
4178       PetscCall(PetscLogGpuTimeBegin());
4179       PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4180       PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4181       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4182       auto Aperm = thrust::make_constant_iterator(1);
4183       auto Bperm = thrust::make_constant_iterator(0);
4184       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4185       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4186       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4187       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4188       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4189       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4190       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4191       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4192       auto p1    = Ccusp->coords->begin();
4193       auto p2    = Ccusp->coords->begin();
4194       thrust::advance(p2, Annz);
4195       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4196       auto cci = thrust::make_counting_iterator(zero);
4197       auto cce = thrust::make_counting_iterator(c->nz);
4198 #if 0 //Errors on SUMMIT cuda 11.1.0
4199       PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4200 #else
4201       auto pred = [](const int &x) { return x; };
4202       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4203       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4204 #endif
4205       PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4206       PetscCall(PetscLogGpuTimeEnd());
4207       delete wPerm;
4208       delete Acoo;
4209       delete Bcoo;
4210       delete Ccoo;
4211       PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4212 
4213       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4214         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4215         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4216         PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4217         Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4218         CsrMatrix                     *CcsrT = new CsrMatrix;
4219         CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4220         CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4221 
4222         (*C)->form_explicit_transpose = PETSC_TRUE;
4223         (*C)->transupdated            = PETSC_TRUE;
4224         Ccusp->rowoffsets_gpu         = NULL;
4225         CmatT->cprowIndices           = NULL;
4226         CmatT->mat                    = CcsrT;
4227         CcsrT->num_rows               = n;
4228         CcsrT->num_cols               = m;
4229         CcsrT->num_entries            = c->nz;
4230         CcsrT->row_offsets            = new THRUSTINTARRAY32(n + 1);
4231         CcsrT->column_indices         = new THRUSTINTARRAY32(c->nz);
4232         CcsrT->values                 = new THRUSTARRAY(c->nz);
4233 
4234         PetscCall(PetscLogGpuTimeBegin());
4235         auto rT = CcsrT->row_offsets->begin();
4236         if (AT) {
4237           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4238           thrust::advance(rT, -1);
4239         }
4240         if (BT) {
4241           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4242           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4243           thrust::copy(titb, tite, rT);
4244         }
4245         auto cT = CcsrT->column_indices->begin();
4246         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4247         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4248         auto vT = CcsrT->values->begin();
4249         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4250         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4251         PetscCall(PetscLogGpuTimeEnd());
4252 
4253         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4254         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4255         PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4256         PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4257         PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4258         PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4259         PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4260         PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4261         PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4262 
4263         PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4264         Ccusp->matTranspose = CmatT;
4265       }
4266     }
4267 
4268     c->free_a = PETSC_TRUE;
4269     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4270     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4271     c->free_ij = PETSC_TRUE;
4272     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4273       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4274       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4275       ii = *Ccsr->row_offsets;
4276       jj = *Ccsr->column_indices;
4277       PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4278       PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4279     } else {
4280       PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4281       PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4282     }
4283     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4284     PetscCall(PetscMalloc1(m, &c->ilen));
4285     PetscCall(PetscMalloc1(m, &c->imax));
4286     c->maxnz         = c->nz;
4287     c->nonzerorowcnt = 0;
4288     c->rmax          = 0;
4289     for (i = 0; i < m; i++) {
4290       const PetscInt nn = c->i[i + 1] - c->i[i];
4291       c->ilen[i] = c->imax[i] = nn;
4292       c->nonzerorowcnt += (PetscInt)!!nn;
4293       c->rmax = PetscMax(c->rmax, nn);
4294     }
4295     PetscCall(PetscMalloc1(c->nz, &c->a));
4296     (*C)->nonzerostate++;
4297     PetscCall(PetscLayoutSetUp((*C)->rmap));
4298     PetscCall(PetscLayoutSetUp((*C)->cmap));
4299     Ccusp->nonzerostate = (*C)->nonzerostate;
4300     (*C)->preallocated  = PETSC_TRUE;
4301   } else {
4302     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4303     c = (Mat_SeqAIJ *)(*C)->data;
4304     if (c->nz) {
4305       Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4306       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4307       PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4308       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4309       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4310       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4311       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4312       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4313       Acsr = (CsrMatrix *)Acusp->mat->mat;
4314       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4315       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4316       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4317       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4318       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4319       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4320       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4321       auto pmid = Ccusp->coords->begin();
4322       thrust::advance(pmid, Acsr->num_entries);
4323       PetscCall(PetscLogGpuTimeBegin());
4324       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4325       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4326       thrust::for_each(zibait, zieait, VecHIPEquals());
4327       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4328       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4329       thrust::for_each(zibbit, ziebit, VecHIPEquals());
4330       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4331       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4332         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4333         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4334         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4335         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4336         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4337         auto       vT    = CcsrT->values->begin();
4338         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4339         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4340         (*C)->transupdated = PETSC_TRUE;
4341       }
4342       PetscCall(PetscLogGpuTimeEnd());
4343     }
4344   }
4345   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4346   (*C)->assembled     = PETSC_TRUE;
4347   (*C)->was_assembled = PETSC_FALSE;
4348   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4349   PetscFunctionReturn(PETSC_SUCCESS);
4350 }
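
/*
  A usage sketch for the merge above, assuming `A` and `B` are MATSEQAIJHIPSPARSE matrices with the
  same number of rows; `C` gets A's columns followed by B's columns.

    Mat C;
    PetscCall(MatSeqAIJHIPSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // allocate and fill C = [A, B]
    // ... later, after the values (but not the nonzero pattern) of A and B change ...
    PetscCall(MatSeqAIJHIPSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   // refresh C's values on the GPU
*/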
4351 
4352 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4353 {
4354   bool               dmem;
4355   const PetscScalar *av;
4356 
4357   PetscFunctionBegin;
4358   dmem = isHipMem(v);
4359   PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4360   if (n && idx) {
4361     THRUSTINTARRAY widx(n);
4362     widx.assign(idx, idx + n);
4363     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4364 
4365     THRUSTARRAY                    *w = NULL;
4366     thrust::device_ptr<PetscScalar> dv;
4367     if (dmem) dv = thrust::device_pointer_cast(v);
4368     else {
4369       w  = new THRUSTARRAY(n);
4370       dv = w->data();
4371     }
4372     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4373 
4374     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4375     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4376     thrust::for_each(zibit, zieit, VecHIPEquals());
4377     if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4378     delete w;
4379   } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4380 
4381   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4382   PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4383   PetscFunctionReturn(PETSC_SUCCESS);
4384 }
4385