xref: /petsc/src/mat/impls/aij/seq/seqhipsparse/aijhipsparse.hip.cxx (revision d756bedd70a89ca052be956bccd75c5761cb2ab4)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the HIPSPARSE library,
4   Portions of this code are under:
5   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6 */
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal()
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/iterator/transform_iterator.h>
17 #if PETSC_CPP_VERSION >= 14
18   #define PETSC_HAVE_THRUST_ASYNC 1
19   #include <thrust/async/for_each.h>
20 #endif
21 #include <thrust/iterator/constant_iterator.h>
22 #include <thrust/iterator/discard_iterator.h>
23 #include <thrust/binary_search.h>
24 #include <thrust/remove.h>
25 #include <thrust/sort.h>
26 #include <thrust/unique.h>
27 
/* Enum-name tables consumed by PetscOptionsEnum(): the value names come first, followed by
   the enum type name, the option prefix, and a terminating 0, per the PetscEnum convention.
   The order of the value names must match the numeric values of the corresponding hipSPARSE
   enums (see the PetscCheck calls in MatSetFromOptions_SeqAIJHIPSPARSE). */
const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
//const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
31 //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
32 
/* Forward declarations for the file-local (static) implementations installed into the
   Mat operation tables and PetscObject composed-function lists further below. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

/* Implementations defined in other compilation units of the library */
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
68 
69 /*
70 PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
71 {
72   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
73 
74   PetscFunctionBegin;
75   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
76   hipsparsestruct->stream = stream;
77   PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
78   PetscFunctionReturn(PETSC_SUCCESS);
79 }
80 
81 PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
82 {
83   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
84 
85   PetscFunctionBegin;
86   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
87   if (hipsparsestruct->handle != handle) {
88     if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
89     hipsparsestruct->handle = handle;
90   }
91   PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
96 {
97   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
98   PetscBool            flg;
99 
100   PetscFunctionBegin;
101   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
102   if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
103   if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 */
107 
108 PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109 {
110   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
111 
112   PetscFunctionBegin;
113   switch (op) {
114   case MAT_HIPSPARSE_MULT:
115     hipsparsestruct->format = format;
116     break;
117   case MAT_HIPSPARSE_ALL:
118     hipsparsestruct->format = format;
119     break;
120   default:
121     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
122   }
123   PetscFunctionReturn(PETSC_SUCCESS);
124 }
125 
126 /*@
127   MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
128   operation. Only the `MatMult()` operation can use different GPU storage formats
129 
130   Not Collective
131 
132   Input Parameters:
133 + A      - Matrix of type `MATSEQAIJHIPSPARSE`
134 . op     - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
135          `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
136 - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
137 
138   Level: intermediate
139 
140 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141 @*/
PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation if this matrix type registered one;
     PetscTryMethod() makes this a harmless no-op for other matrix types. */
  PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
149 
150 PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
151 {
152   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
153 
154   PetscFunctionBegin;
155   hipsparsestruct->use_cpu_solve = use_cpu;
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 /*@
160   MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
161 
162   Input Parameters:
163 + A       - Matrix of type `MATSEQAIJHIPSPARSE`
164 - use_cpu - set flag for using the built-in CPU `MatSolve()`
165 
166   Level: intermediate
167 
168   Notes:
  The hipSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or on the GPU (GPU is the default).
172 
173 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174 @*/
PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation if this matrix type registered one;
     PetscTryMethod() makes this a harmless no-op for other matrix types. */
  PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
182 
183 static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
184 {
185   PetscFunctionBegin;
186   switch (op) {
187   case MAT_FORM_EXPLICIT_TRANSPOSE:
188     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
189     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
190     A->form_explicit_transpose = flg;
191     break;
192   default:
193     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
194     break;
195   }
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
200 {
201   PetscBool            row_identity, col_identity;
202   Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
203   IS                   isrow = b->row, iscol = b->col;
204   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;
205 
206   PetscFunctionBegin;
207   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
208   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
209   B->offloadmask = PETSC_OFFLOAD_CPU;
210   /* determine which version of MatSolve needs to be used. */
211   PetscCall(ISIdentity(isrow, &row_identity));
212   PetscCall(ISIdentity(iscol, &col_identity));
213   if (!hipsparsestruct->use_cpu_solve) {
214     if (row_identity && col_identity) {
215       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
216       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
217     } else {
218       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
219       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
220     }
221   }
222   B->ops->matsolve          = NULL;
223   B->ops->matsolvetranspose = NULL;
224 
225   /* get the triangular factors */
226   if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
/* Process runtime options for MATSEQAIJHIPSPARSE: GPU storage format for SpMV (or for
   everything), CPU-vs-GPU triangular solves, and the hipSPARSE SpMV/SpMM algorithms.
   Factored matrices (A->factortype != MAT_FACTOR_NONE) expose none of these options. */
static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatHIPSPARSEStorageFormat format;
  PetscBool                 flg;
  Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
    /* storage format used for SpMV and TriSolve */
    PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
    PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
    /* same positional-consistency check as for the SpMV algorithm above */
    PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
    /*
    PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
    */
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
260 
/* Build (first call) or refresh (subsequent calls) the unit-diagonal lower-triangular
   ILU factor of A as a CSR matrix on the GPU, including the csrsv solve analysis.
   On the refresh path only the numerical values are re-uploaded; the sparsity pattern,
   descriptor, and analysis info created on the first call are reused. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  const PetscInt                     *ai = a->i, *aj = a->j, *vi;
  const MatScalar                    *aa = a->a, *v;
  PetscInt                           *AiLo, *AjLo;
  PetscInt                            i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit diagonal entries plus the stored entries of rows 1..n-1 (ai[n]-ai[1]);
         row 0 contributes only its unit diagonal */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first call: build the full CSR structure in pinned host memory, then upload */
        PetscScalar *AALo;
        PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1; /* next free slot in AjLo/AALo (slot 0 holds row 0's unit diagonal) */
        rowOffset = 1; /* running row pointer for AiLo */
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy row i's off-diagonal entries, then append the unit diagonal */
          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;
          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat                 = new CsrMatrix;
        loTriFactor->csrMat->num_rows       = n;
        loTriFactor->csrMat->num_cols       = n;
        loTriFactor->csrMat->num_entries    = nzLower;
        loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);

        /* thrust assign() copies the pinned host arrays to the device */
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept alive (owned by loTriFactor) for the value-only refresh path below */
        loTriFactor->AA_h                                           = AALo;
        PetscCallHIP(hipHostFree(AiLo));
        PetscCallHIP(hipHostFree(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
381 
/* Build (first call) or refresh (subsequent calls) the upper-triangular ILU factor of A
   as a CSR matrix on the GPU, including the csrsv solve analysis. The U factor is read
   from the factored AIJ storage via the diagonal markers 'adiag', which index it in
   reverse row order (adiag[i] > adiag[i+1]); diagonal entries are stored inverted
   (1/pivot) so the device solve multiplies rather than divides. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  const PetscInt                     *aj                  = a->j, *adiag, *vi;
  const MatScalar                    *aa                  = a->a, *v;
  PetscInt                           *AiUp, *AjUp;
  PetscInt                            i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* the U part spans [adiag[n], adiag[0]) in the factored storage */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        /* first call: build the full CSR structure in pinned host memory, then upload */
        PetscScalar *AAUp;
        PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        /* walk rows from last to first, filling AjUp/AAUp back-to-front */
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* store the inverted pivot */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat                 = new CsrMatrix;
        upTriFactor->csrMat->num_rows       = n;
        upTriFactor->csrMat->num_cols       = n;
        upTriFactor->csrMat->num_entries    = nzUpper;
        upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
        /* thrust assign() copies the pinned host arrays to the device */
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is kept alive (owned by upTriFactor) for the value-only refresh path below */
        upTriFactor->AA_h                                           = AAUp;
        PetscCallHIP(hipHostFree(AiUp));
        PetscCallHIP(hipHostFree(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        /* refresh path: sparsity pattern is unchanged, re-upload only the values */
        if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
495 
496 static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
497 {
498   PetscBool                      row_identity, col_identity;
499   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
500   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
501   IS                             isrow = a->row, iscol = a->icol;
502   PetscInt                       n = A->rmap->n;
503 
504   PetscFunctionBegin;
505   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
506   PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
507   PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));
508 
509   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
510   hipsparseTriFactors->nnz = a->nz;
511 
512   A->offloadmask = PETSC_OFFLOAD_BOTH;
513   /* lower triangular indices */
514   PetscCall(ISIdentity(isrow, &row_identity));
515   if (!row_identity && !hipsparseTriFactors->rpermIndices) {
516     const PetscInt *r;
517 
518     PetscCall(ISGetIndices(isrow, &r));
519     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
520     hipsparseTriFactors->rpermIndices->assign(r, r + n);
521     PetscCall(ISRestoreIndices(isrow, &r));
522     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
523   }
524   /* upper triangular indices */
525   PetscCall(ISIdentity(iscol, &col_identity));
526   if (!col_identity && !hipsparseTriFactors->cpermIndices) {
527     const PetscInt *c;
528 
529     PetscCall(ISGetIndices(iscol, &c));
530     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
531     hipsparseTriFactors->cpermIndices->assign(c, c + n);
532     PetscCall(ISRestoreIndices(iscol, &c));
533     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
534   }
535   PetscFunctionReturn(PETSC_SUCCESS);
536 }
537 
/*
  Build (or refresh) the GPU-resident triangular factors used by the ICC solves.

  The host-side factor A is read through the SeqSBAIJ view (upper triangle only).
  NOTE(review): the code reads v[nz] -- the LAST stored entry of each row -- as the
  diagonal of the factor; this assumes the sbaij Cholesky factor keeps the diagonal
  at the end of each row. Confirm against the host factorization routine.

  First call (no upTriFactor/loTriFactor yet): the CSR structure (AiUp/AjUp) is
  assembled on the host in pinned memory and uploaded along with the values.
  Subsequent calls: only the numerical values are refreshed into the existing GPU
  structures. The "lower" factor shares the upper factor's sparsity pattern; it is
  stored as upper-triangular data and solved with HIPSPARSE_OPERATION_TRANSPOSE.
*/
static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  PetscInt                           *AiUp, *AjUp;
  PetscScalar                        *AAUp;
  PetscScalar                        *AALo;
  PetscInt                            nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                       *b  = (Mat_SeqSBAIJ *)A->data; /* same Mat viewed as SBAIJ to reach the factor's i/j/a arrays */
  const PetscInt                     *ai = b->i, *aj = b->j, *vj;
  const MatScalar                    *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* hipHostMalloc returns page-locked host memory so the uploads below are fast */
      PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz]; /* upper factor stores the reciprocal diagonal */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* off-diagonals: negate for the upper factor, additionally scale by 1/diag for the lower */
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat                 = new CsrMatrix;
        upTriFactor->csrMat->num_rows       = A->rmap->n;
        upTriFactor->csrMat->num_cols       = A->cmap->n;
        upTriFactor->csrMat->num_entries    = a->nz;
        upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description: the lower factor reuses the UPPER-stored
           data and is solved with the transpose operation below */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same row_offsets/column_indices as the upper factor, different values */
        loTriFactor->csrMat                 = new CsrMatrix;
        loTriFactor->csrMat->num_rows       = A->rmap->n;
        loTriFactor->csrMat->num_cols       = A->cmap->n;
        loTriFactor->csrMat->num_entries    = a->nz;
        loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallHIP(hipHostFree(AiUp));
        PetscCallHIP(hipHostFree(AjUp));
      } else {
        /* Structures already exist on the GPU: recompute only the values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallHIP(hipHostFree(AAUp));
      PetscCallHIP(hipHostFree(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
716 
717 static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
718 {
719   PetscBool                      perm_identity;
720   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
721   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
722   IS                             ip                  = a->row;
723   PetscInt                       n                   = A->rmap->n;
724 
725   PetscFunctionBegin;
726   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
727   PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
728   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
729   hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
730 
731   A->offloadmask = PETSC_OFFLOAD_BOTH;
732   /* lower triangular indices */
733   PetscCall(ISIdentity(ip, &perm_identity));
734   if (!perm_identity) {
735     IS              iip;
736     const PetscInt *irip, *rip;
737 
738     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
739     PetscCall(ISGetIndices(iip, &irip));
740     PetscCall(ISGetIndices(ip, &rip));
741     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
742     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
743     hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
744     hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
745     PetscCall(ISRestoreIndices(iip, &irip));
746     PetscCall(ISDestroy(&iip));
747     PetscCall(ISRestoreIndices(ip, &rip));
748     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
749   }
750   PetscFunctionReturn(PETSC_SUCCESS);
751 }
752 
753 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
754 {
755   PetscBool   perm_identity;
756   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
757   IS          ip = b->row;
758 
759   PetscFunctionBegin;
760   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
761   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
762   B->offloadmask = PETSC_OFFLOAD_CPU;
763   /* determine which version of MatSolve needs to be used. */
764   PetscCall(ISIdentity(ip, &perm_identity));
765   if (perm_identity) {
766     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
767     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
768     B->ops->matsolve          = NULL;
769     B->ops->matsolvetranspose = NULL;
770   } else {
771     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE;
772     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE;
773     B->ops->matsolve          = NULL;
774     B->ops->matsolvetranspose = NULL;
775   }
776 
777   /* get the triangular factors */
778   PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
779   PetscFunctionReturn(PETSC_SUCCESS);
780 }
781 
782 static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
783 {
784   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
785   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
786   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
787   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
788   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
789   hipsparseIndexBase_t                indexBase;
790   hipsparseMatrixType_t               matrixType;
791   hipsparseFillMode_t                 fillMode;
792   hipsparseDiagType_t                 diagType;
793 
794   PetscFunctionBegin;
795   /* allocate space for the transpose of the lower triangular factor */
796   PetscCall(PetscNew(&loTriFactorT));
797   loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
798 
799   /* set the matrix descriptors of the lower triangular factor */
800   matrixType = hipsparseGetMatType(loTriFactor->descr);
801   indexBase  = hipsparseGetMatIndexBase(loTriFactor->descr);
802   fillMode   = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
803   diagType   = hipsparseGetMatDiagType(loTriFactor->descr);
804 
805   /* Create the matrix description */
806   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
807   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
808   PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
809   PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
810   PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
811 
812   /* set the operation */
813   loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
814 
815   /* allocate GPU space for the CSC of the lower triangular factor*/
816   loTriFactorT->csrMat                 = new CsrMatrix;
817   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
818   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
819   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
820   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
821   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
822   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
823 
824   /* compute the transpose of the lower triangular factor, i.e. the CSC */
825   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
826 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
827   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
828                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
829                                                   loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
830   PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
831 #endif
832 */
833   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
834 
835   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
836                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
837 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
838                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
839                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
840 #else
841                                        loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
842 #endif
843 
844   PetscCallHIP(WaitForHIP());
845   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
846 
847   /* Create the solve analysis information */
848   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
849   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
850   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
851                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
852   PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
853 
854   /* perform the solve analysis */
855   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
856                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
857 
858   PetscCallHIP(WaitForHIP());
859   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
860 
861   /* assign the pointer */
862   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
863 
864   /*********************************************/
865   /* Now the Transpose of the Upper Tri Factor */
866   /*********************************************/
867 
868   /* allocate space for the transpose of the upper triangular factor */
869   PetscCall(PetscNew(&upTriFactorT));
870   upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
871 
872   /* set the matrix descriptors of the upper triangular factor */
873   matrixType = hipsparseGetMatType(upTriFactor->descr);
874   indexBase  = hipsparseGetMatIndexBase(upTriFactor->descr);
875   fillMode   = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
876   diagType   = hipsparseGetMatDiagType(upTriFactor->descr);
877 
878   /* Create the matrix description */
879   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
880   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
881   PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
882   PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
883   PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
884 
885   /* set the operation */
886   upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
887 
888   /* allocate GPU space for the CSC of the upper triangular factor*/
889   upTriFactorT->csrMat                 = new CsrMatrix;
890   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
891   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
892   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
893   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
894   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
895   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
896 
897   /* compute the transpose of the upper triangular factor, i.e. the CSC */
898   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
899 #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
900   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
901                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
902                                                   upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
903   PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
904 #endif
905 */
906   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
907   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
908                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
909 #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
910                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
911                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
912 #else
913                                        upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
914 #endif
915 
916   PetscCallHIP(WaitForHIP());
917   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
918 
919   /* Create the solve analysis information */
920   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
921   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
922   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
923                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
924   PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
925 
926   /* perform the solve analysis */
927   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
928                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
929 
930   PetscCallHIP(WaitForHIP());
931   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
932 
933   /* assign the pointer */
934   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
935   PetscFunctionReturn(PETSC_SUCCESS);
936 }
937 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real part.
   Used by MatSeqAIJHIPSPARSEFormExplicitTranspose to recover integer permutation
   indices from the scalar sequence run through csr2csc. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};
941 
/*
  Form (or refresh) the explicit transpose of A on the GPU, stored in
  hipsparsestruct->matTranspose. A no-op when A->transupdated is already set.

  CSR path: the structure is created once; on value updates, a cached
  permutation (csr2csc_i) maps A's values directly into the transpose, avoiding
  a full csr2csc each time. The permutation itself is computed by running a
  0,1,2,... scalar sequence through csr2csc and converting back to integers.

  ELL/HYB path: converts HYB -> CSR -> CSC -> HYB through host-allocated
  temporaries that are freed before returning.
*/
static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
  hipsparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats cannot be value-updated in place; rebuild from scratch */
  if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
    matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
    PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
    indexBase = hipsparseGetMatIndexBase(matstruct->descr);
    PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
      /* allocate the transpose structure; note the swapped row/col dimensions */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

      PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                                            indexBase, hipsparse_scalartype));
    } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT dimensions mirror temp's (rmap x cmap) rather than being
         swapped; appears to rely on square usage -- confirm for rectangular A */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                                           tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

      /* Last, convert CSC to HYB */
      hipsparseHybMat_t hybMat;
      PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
      hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
      PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
    }
  }
  if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!hipsparsestruct->csr2csc_i) {
      /* run the sequence 0,1,2,... through csr2csc; the resulting values ARE the
         permutation from A's value ordering to the transpose's value ordering */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = hipsparseGetMatIndexBase(matstruct->descr);
      if (matrix->num_entries) {
        /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
           Need to verify this for ROCm.
        */
        PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
      } else {
        /* empty matrix: row offsets are all just the index base */
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
    }
    /* gather A's values through the cached permutation into the transpose */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                 = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1083 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
/* Solve A^T x = b using explicitly stored transposed triangular factors: row-permute b, solve with the
   transposed upper factor, then the transposed lower factor, and finally apply the column permutation.
   The transposed factors are built lazily on the first transpose solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only the first time we get here */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation; the permuted right-hand side lands in x */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the transposed upper factor (input x, result in the work vector) */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with the transposed lower factor (input work vector, result in x) */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1136 
1137 static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1138 {
1139   const PetscScalar                  *barray;
1140   PetscScalar                        *xarray;
1141   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1142   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1143   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1144   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1145 
1146   PetscFunctionBegin;
1147   /* Analyze the matrix and create the transpose ... on the fly */
1148   if (!loTriFactorT && !upTriFactorT) {
1149     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1150     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1151     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1152   }
1153 
1154   /* Get the GPU pointers */
1155   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1156   PetscCall(VecHIPGetArrayRead(bb, &barray));
1157 
1158   PetscCall(PetscLogGpuTimeBegin());
1159   /* First, solve U */
1160   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1161                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1162 
1163   /* Then, solve L */
1164   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1165                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1166 
1167   /* restore */
1168   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1169   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1170   PetscCall(PetscLogGpuTimeEnd());
1171   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1176 {
1177   const PetscScalar                    *barray;
1178   PetscScalar                          *xarray;
1179   thrust::device_ptr<const PetscScalar> bGPU;
1180   thrust::device_ptr<PetscScalar>       xGPU;
1181   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1182   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1183   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1184   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1185 
1186   PetscFunctionBegin;
1187   /* Get the GPU pointers */
1188   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1189   PetscCall(VecHIPGetArrayRead(bb, &barray));
1190   xGPU = thrust::device_pointer_cast(xarray);
1191   bGPU = thrust::device_pointer_cast(barray);
1192 
1193   PetscCall(PetscLogGpuTimeBegin());
1194   /* First, reorder with the row permutation */
1195   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
1196 
1197   /* Next, solve L */
1198   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1199                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1200 
1201   /* Then, solve U */
1202   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1203                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1204 
1205   /* Last, reorder with the column permutation */
1206   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);
1207 
1208   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1209   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1210   PetscCall(PetscLogGpuTimeEnd());
1211   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1212   PetscFunctionReturn(PETSC_SUCCESS);
1213 }
1214 
1215 static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1216 {
1217   const PetscScalar                  *barray;
1218   PetscScalar                        *xarray;
1219   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1220   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1221   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1222   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1223 
1224   PetscFunctionBegin;
1225   /* Get the GPU pointers */
1226   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1227   PetscCall(VecHIPGetArrayRead(bb, &barray));
1228 
1229   PetscCall(PetscLogGpuTimeBegin());
1230   /* First, solve L */
1231   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1232                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1233 
1234   /* Next, solve U */
1235   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1236                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1237 
1238   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1239   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1240   PetscCall(PetscLogGpuTimeEnd());
1241   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1242   PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244 
1245 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
/* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
/* Solve with the ILU(0) factors through the generic SpSV API: L y = b, then U x = y.
   The dense-vector descriptors dnVecDescr_X/Y are retargeted at the current arrays with
   hipsparseDnVecSetValues(); fs->Y is a persistent device work array holding y. */
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* ROCm >= 5.6 dropped the explicit buffer argument from hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                   /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
  #else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                                     /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
  #endif
  /* Solve U*x = y; dnVecDescr_X is re-pointed at the output array */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
  #else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
  #endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1285 
/* Transpose solve with the ILU(0) factors: solve U^T y = b, then L^T x = y, using
   HIPSPARSE_OPERATION_TRANSPOSE on the same L/U matrix descriptors. Dedicated transpose SpSV
   descriptors/buffers are created lazily on the first call, and the (numeric) SpSV analysis is
   redone whenever the factor values have changed (see updatedTransposeSpSVAnalysis). */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                                fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* SpSV analysis reads the matrix values, so it must be redone after each numeric factorization */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b; fs->Y is a persistent device work array holding y */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* ROCm >= 5.6 dropped the explicit buffer argument from hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
  #else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  #endif
  /* Solve Lt*x = y; dnVecDescr_X is re-pointed at the output array */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
  #else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
  #endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1342 
/* Numeric ILU(0) factorization: copy A's values into fact's device CSR storage (the sparsity
   pattern was fixed in the symbolic phase), factor in place with hipsparseXcsrilu02, then
   (re)run the SpSV analyses for L and U, which read the new factor values. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix                     *Acsr;
  PetscInt                       m, nz;
  PetscBool                      flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
                                          fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* hipsparseXcsrilu02_zeroPivot() reports the first zero pivot encountered during numeric factorization */
    int               numerical_zero;
    hipsparseStatus_t status;
    status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Wire up the ILU0-specific solve callbacks; matsolve variants are not implemented for this path */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1392 
1393 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1394 {
1395   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1396   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1397   PetscInt                       m, nz;
1398 
1399   PetscFunctionBegin;
1400   if (PetscDefined(USE_DEBUG)) {
1401     PetscBool flg, diagDense;
1402 
1403     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1404     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1405     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1406     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1407     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1408   }
1409 
1410   /* Free the old stale stuff */
1411   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1412 
1413   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1414      but they will not be used. Allocate them just for easy debugging.
1415    */
1416   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1417 
1418   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1419   fact->factortype             = MAT_FACTOR_ILU;
1420   fact->info.factor_mallocs    = 0;
1421   fact->info.fill_ratio_given  = info->fill;
1422   fact->info.fill_ratio_needed = 1.0;
1423 
1424   aij->row = NULL;
1425   aij->col = NULL;
1426 
1427   /* ====================================================================== */
1428   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1429   /* We'll do in-place factorization on fact                                */
1430   /* ====================================================================== */
1431   const int *Ai, *Aj;
1432 
1433   m  = fact->rmap->n;
1434   nz = aij->nz;
1435 
1436   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1437   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1438   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1439   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1440   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1441   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1442 
1443   /* ====================================================================== */
1444   /* Create descriptors for M, L, U                                         */
1445   /* ====================================================================== */
1446   hipsparseFillMode_t fillMode;
1447   hipsparseDiagType_t diagType;
1448 
1449   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1450   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1451   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1452 
1453   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1454     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1455     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1456     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1457     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1458   */
1459   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1460   diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1461   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1462   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1463   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1464 
1465   fillMode = HIPSPARSE_FILL_MODE_UPPER;
1466   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1467   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1468   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1469   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1470 
1471   /* ========================================================================= */
1472   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1473   /* ========================================================================= */
1474   PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1475   if (m)
1476     PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1477                                                      fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1478 
1479   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1480   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1481 
1482   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1483   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1484 
1485   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1486   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1487 
1488   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1489   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1490 
1491   /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1492      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1493    */
1494   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1495     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1496     fs->spsvBuffer_L = fs->factBuffer_M;
1497     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1498   } else {
1499     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1500     fs->spsvBuffer_U = fs->factBuffer_M;
1501     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1502   }
1503 
1504   /* ========================================================================== */
1505   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1506   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1507   /* ========================================================================== */
1508   int structural_zero;
1509 
1510   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1511   if (m)
1512     PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1513                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1514   if (PetscDefined(USE_DEBUG)) {
1515     /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1516     hipsparseStatus_t status;
1517     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1518     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1519   }
1520 
1521   /* Estimate FLOPs of the numeric factorization */
1522   {
1523     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
1524     PetscInt       *Ai, nzRow, nzLeft;
1525     PetscLogDouble  flops = 0.0;
1526     const PetscInt *Adiag;
1527 
1528     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &Adiag, NULL));
1529     Ai = Aseq->i;
1530     for (PetscInt i = 0; i < m; i++) {
1531       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1532         nzRow  = Ai[i + 1] - Ai[i];
1533         nzLeft = Adiag[i] - Ai[i];
1534         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1535           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1536         */
1537         nzLeft = (nzRow - 1) / 2;
1538         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1539       }
1540     }
1541     fs->numericFactFlops = flops;
1542   }
1543   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1544   PetscFunctionReturn(PETSC_SUCCESS);
1545 }
1546 
/* Solve A x = b on the GPU using the device ICC(0) factor held in fact->spptr:
   forward solve L y = b, then backward solve L^T x = y. The SpSV descriptors and
   work buffers were set up in MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0() and the
   numeric values in MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(). */
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  /* Point the dense-vector descriptors at b (as X) and the internal work array fs->Y */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  /* Newer hipSPARSE (5.6 and 6.x) dropped the explicit work-buffer argument from hipsparseSpSV_solve() */
  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
  #else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
  #endif
  /* Solve Lt*x = y */
  /* Reuse descriptor X to now point at the output array of x; Y still holds the intermediate solution */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
  #else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
  #endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: roughly one multiply-add per stored nonzero of each factor */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1585 
/* Numeric ICC(0) factorization on the GPU: copy A's values into the preallocated
   device CSR of fact, run hipSPARSE's in-place incomplete Cholesky (csric02), and
   perform the SpSV analysis for the L and L^T solves used later by MatSolve. The
   symbolic phase (MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0) must have run first. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix                     *Acsr;
  PetscInt                       m, nz;
  PetscBool                      flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* ensure A's values are up to date on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* Factorize fact inplace */
  /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  /* Guard m == 0: hipSPARSE errors out on empty matrices */
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int               numerical_zero;
    hipsparseStatus_t status;
    status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analyze the L solve (forward substitution) with the freshly computed factor values */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* Factor now lives only on the device; install the ICC0 solve callbacks */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* FLOP count estimated during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1637 
/* Symbolic ICC(0) factorization on the GPU. Since ICC(0) introduces no fill, fact
   reuses A's sparsity pattern: we copy A's row pointers and column indices to the
   device, create the hipSPARSE matrix/vector/SpSV descriptors, size and allocate the
   work buffers, run the csric02 structural analysis, and estimate the FLOPs the
   numeric phase will log. `perm` is expected to be the identity (checked by the caller
   MatICCFactorSymbolic_SeqAIJHIPSPARSE) and is not used here. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                       m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) keeps A's pattern, so no extra fill is needed */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
  PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  hipsparseFillMode_t fillMode;
  hipsparseDiagType_t diagType;

  /* M: the legacy (non-generic) descriptor used by the csric02 factorization routines */
  PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
  PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
  PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
    hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L: the generic SpSV descriptor sharing the same device CSR arrays as M */
  fillMode = HIPSPARSE_FILL_MODE_LOWER;
  diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* Work vectors X, Y backing the dense-vector descriptors used by SpSV */
  PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

  PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int structural_zero;

  fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    hipsparseStatus_t status;
    /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
    status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* heuristic: assume half of the off-diagonal entries lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* logged later by the numeric phase */
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1778 #endif
1779 
1780 static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1781 {
1782   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1783 
1784   PetscFunctionBegin;
1785 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1786   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1787   if (!info->factoronhost) {
1788     PetscCall(ISIdentity(isrow, &row_identity));
1789     PetscCall(ISIdentity(iscol, &col_identity));
1790   }
1791   if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1792   else
1793 #endif
1794   {
1795     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1796     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1797     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1798   }
1799   PetscFunctionReturn(PETSC_SUCCESS);
1800 }
1801 
1802 static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1803 {
1804   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1805 
1806   PetscFunctionBegin;
1807   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1808   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1809   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1810   PetscFunctionReturn(PETSC_SUCCESS);
1811 }
1812 
1813 static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1814 {
1815   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1816 
1817   PetscFunctionBegin;
1818 #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1819   PetscBool perm_identity = PETSC_FALSE;
1820   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1821   if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1822   else
1823 #endif
1824   {
1825     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1826     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1827     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1828   }
1829   PetscFunctionReturn(PETSC_SUCCESS);
1830 }
1831 
1832 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1833 {
1834   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1835 
1836   PetscFunctionBegin;
1837   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1838   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1839   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1840   PetscFunctionReturn(PETSC_SUCCESS);
1841 }
1842 
/* Report the solver package name for factors produced by this backend (MATSOLVERHIPSPARSE).
   Composed on factor matrices as the "MatFactorGetSolverType_C" method. */
static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERHIPSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1849 
1850 /*MC
  MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJHIPSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  hipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
1857 
1858   Level: beginner
1859 
1860 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1861 M*/
1862 
/* Factory routine for MATSOLVERHIPSPARSE: create the (empty) factor matrix B for A and
   wire up the symbolic-factorization function pointers for the requested factor type.
   When A is bound to the CPU, the plain SeqAIJ symbolic routines are installed instead
   of the HIPSPARSE ones. Supported ftypes: LU, ILU, ILUDT, Cholesky, ICC. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n; /* factors are square, sized by A's row count */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; /* must be set before MatSetType() so the type ctor sees it */
  PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for the incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

  /* Defer allocation: the symbolic factorization decides the sparsity of B */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1903 
/* Copy the matrix values from the device back to the host CSR array (a->a) when the
   device copy is newer (offloadmask == PETSC_OFFLOAD_GPU). Works for unfactored
   matrices (values live in the CsrMatrix) and, with HIP >= 4.5, for device-factored
   matrices (values live in fs->csrVal). On success both copies are marked in sync. */
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
  /* spptr holds a Mat_SeqAIJHIPSPARSETriFactors instead when A is a factored matrix */
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device values now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1933 
/* Read-write access to the host value array: sync device -> host first so the
   caller sees up-to-date values. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1941 
/* End read-write access: the host copy may have been modified, so mark the host
   as the only valid copy (device will be refreshed lazily on next use). */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1949 
/* Read-only access to the host value array: sync device -> host first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1957 
/* End read-only access: nothing was modified, so the offload mask is untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1964 
/* Write-only access to the host value array: no device -> host sync needed since
   the caller promises to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1971 
/* End write-only access: the host copy was (re)written, so mark the host as the
   only valid copy. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1979 
1980 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1981 {
1982   Mat_SeqAIJHIPSPARSE *cusp;
1983   CsrMatrix           *matrix;
1984 
1985   PetscFunctionBegin;
1986   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1987   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1988   cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1989   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1990   matrix = (CsrMatrix *)cusp->mat->mat;
1991 
1992   if (i) {
1993 #if !defined(PETSC_USE_64BIT_INDICES)
1994     *i = matrix->row_offsets->data().get();
1995 #else
1996     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
1997 #endif
1998   }
1999   if (j) {
2000 #if !defined(PETSC_USE_64BIT_INDICES)
2001     *j = matrix->column_indices->data().get();
2002 #else
2003     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2004 #endif
2005   }
2006   if (a) *a = matrix->values->data().get();
2007   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2008   PetscFunctionReturn(PETSC_SUCCESS);
2009 }
2010 
2011 PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2012 {
2013   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2014   Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
2015   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
2016   PetscBool                      both            = PETSC_TRUE;
2017   PetscInt                       m               = A->rmap->n, *ii, *ridx, tmp;
2018 
2019   PetscFunctionBegin;
2020   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2021   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2022     if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2023       CsrMatrix *matrix;
2024       matrix = (CsrMatrix *)hipsparsestruct->mat->mat;
2025 
2026       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2027       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2028       matrix->values->assign(a->a, a->a + a->nz);
2029       PetscCallHIP(WaitForHIP());
2030       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2031       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2032       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
2033     } else {
2034       PetscInt nnz;
2035       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2036       PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2037       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
2038       delete hipsparsestruct->workVector;
2039       delete hipsparsestruct->rowoffsets_gpu;
2040       hipsparsestruct->workVector     = NULL;
2041       hipsparsestruct->rowoffsets_gpu = NULL;
2042       try {
2043         if (a->compressedrow.use) {
2044           m    = a->compressedrow.nrows;
2045           ii   = a->compressedrow.i;
2046           ridx = a->compressedrow.rindex;
2047         } else {
2048           m    = A->rmap->n;
2049           ii   = a->i;
2050           ridx = NULL;
2051         }
2052         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2053         if (!a->a) {
2054           nnz  = ii[m];
2055           both = PETSC_FALSE;
2056         } else nnz = a->nz;
2057         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2058 
2059         /* create hipsparse matrix */
2060         hipsparsestruct->nrows = m;
2061         matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
2062         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2063         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2064         PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2065 
2066         PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2067         PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2068         PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2069         PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2070         PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2071         PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2072         PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2073 
2074         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2075         if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2076           /* set the matrix */
2077           CsrMatrix *mat      = new CsrMatrix;
2078           mat->num_rows       = m;
2079           mat->num_cols       = A->cmap->n;
2080           mat->num_entries    = nnz;
2081           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2082           mat->column_indices = new THRUSTINTARRAY32(nnz);
2083           mat->values         = new THRUSTARRAY(nnz);
2084           mat->row_offsets->assign(ii, ii + m + 1);
2085           mat->column_indices->assign(a->j, a->j + nnz);
2086           if (a->a) mat->values->assign(a->a, a->a + nnz);
2087 
2088           /* assign the pointer */
2089           matstruct->mat = mat;
2090           if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2091             PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2092                                                   HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2093           }
2094         } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2095           CsrMatrix *mat      = new CsrMatrix;
2096           mat->num_rows       = m;
2097           mat->num_cols       = A->cmap->n;
2098           mat->num_entries    = nnz;
2099           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2100           mat->column_indices = new THRUSTINTARRAY32(nnz);
2101           mat->values         = new THRUSTARRAY(nnz);
2102           mat->row_offsets->assign(ii, ii + m + 1);
2103           mat->column_indices->assign(a->j, a->j + nnz);
2104           if (a->a) mat->values->assign(a->a, a->a + nnz);
2105 
2106           hipsparseHybMat_t hybMat;
2107           PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2108           hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2109           PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2110           /* assign the pointer */
2111           matstruct->mat = hybMat;
2112 
2113           if (mat) {
2114             if (mat->values) delete (THRUSTARRAY *)mat->values;
2115             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2116             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2117             delete (CsrMatrix *)mat;
2118           }
2119         }
2120 
2121         /* assign the compressed row indices */
2122         if (a->compressedrow.use) {
2123           hipsparsestruct->workVector = new THRUSTARRAY(m);
2124           matstruct->cprowIndices     = new THRUSTINTARRAY(m);
2125           matstruct->cprowIndices->assign(ridx, ridx + m);
2126           tmp = m;
2127         } else {
2128           hipsparsestruct->workVector = NULL;
2129           matstruct->cprowIndices     = NULL;
2130           tmp                         = 0;
2131         }
2132         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2133 
2134         /* assign the pointer */
2135         hipsparsestruct->mat = matstruct;
2136       } catch (char *ex) {
2137         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2138       }
2139       PetscCallHIP(WaitForHIP());
2140       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2141       hipsparsestruct->nonzerostate = A->nonzerostate;
2142     }
2143     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2144   }
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 
2148 struct VecHIPPlusEquals {
2149   template <typename Tuple>
2150   __host__ __device__ void operator()(Tuple t)
2151   {
2152     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2153   }
2154 };
2155 
2156 struct VecHIPEquals {
2157   template <typename Tuple>
2158   __host__ __device__ void operator()(Tuple t)
2159   {
2160     thrust::get<1>(t) = thrust::get<0>(t);
2161   }
2162 };
2163 
2164 struct VecHIPEqualsReverse {
2165   template <typename Tuple>
2166   __host__ __device__ void operator()(Tuple t)
2167   {
2168     thrust::get<0>(t) = thrust::get<1>(t);
2169   }
2170 };
2171 
/* Context stored in C->product->data for hipSPARSE-based matrix products
   (sparse * dense via SpMM, sparse * sparse via SpGEMM). It keeps the
   descriptors and device scratch buffers that must survive between the
   symbolic and numeric phases; released by MatProductCtxDestroy_MatMatHipsparse(). */
struct MatProductCtx_MatMatHipsparse {
  PetscBool             cisdense;    /* C was MATSEQDENSE (CPU) on entry to symbolic; numeric converts the result back */
  PetscScalar          *Bt;          /* device buffer, hipFree'd at destroy; presumably holds an explicit B^T — not set in this chunk, confirm elsewhere */
  Mat                   X;           /* intermediate dense matrix holding A*P (PtAP) or A*R^T (RARt) */
  PetscBool             reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;       /* flop count precomputed during symbolic, logged in the numeric phase */
  CsrMatrix            *Bcsr;        /* full-row-offset view of B when B is kept in compressed row storage */
  hipsparseSpMatDescr_t matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized; /* SpMM descriptors/work buffer below are set up; C = alpha op(A) op(B) + beta C */
  hipsparseDnMatDescr_t matBDescr;   /* dense descriptor for B (SpMM path) */
  hipsparseDnMatDescr_t matCDescr;   /* dense descriptor for C, or for X in the PtAP/RARt case */
  PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  void *dBuffer4, *dBuffer5; /* extra device buffers, hipFree'd at destroy; NOTE(review): likely for the SpGEMMreuse API — confirm where allocated */
#endif
  size_t                 mmBufferSize; /* current allocation size of mmBuffer */
  void                  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  hipsparseSpGEMMDescr_t spgemmDesc;           /* SpGEMM descriptor for the sparse-sparse product */
};
2191 
2192 static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
2193 {
2194   MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;
2195 
2196   PetscFunctionBegin;
2197   PetscCallHIP(hipFree(mmdata->Bt));
2198   delete mmdata->Bcsr;
2199   if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2200   if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2201   if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2202   if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2203 #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2204   if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2205   if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2206 #endif
2207   if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2208   if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2209   PetscCall(MatDestroy(&mmdata->X));
2210   PetscCall(PetscFree(*(void **)data));
2211   PetscFunctionReturn(PETSC_SUCCESS);
2212 }
2213 
/* Numeric phase for C = op(A) op(B) with A MATSEQAIJHIPSPARSE and B dense,
   computed with hipsparseSpMM. For PtAP/RARt the SpMM result is written to
   the intermediate dense matrix mmdata->X and the remaining dense-dense
   product is performed afterwards. The context mmdata was created by
   MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(). */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  PetscInt                       m, n, blda, clda; /* SpMM output dims and leading dimensions of B and C/X */
  PetscBool                      flg, biship;
  Mat_SeqAIJHIPSPARSE           *cusp;
  hipsparseOperation_t           opA;
  const PetscScalar             *barray;
  PetscScalar                   *carray;
  MatProductCtx_MatMatHipsparse *mmdata;
  Mat_SeqAIJHIPSPARSEMultStruct *mat;
  CsrMatrix                     *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  /* select op(A) and the dimensions of the SpMM output (C, or mmdata->X for PtAP/RARt) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*P (P = B) here; the Pt*X step is done below */
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = HIPSPARSE_OPERATION_TRANSPOSE;
    } else { /* use the stored explicit transpose instead of op=transpose */
      PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*Rt (R = B) here; the R*X step is done below */
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
  if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* the SpMM output goes to the intermediate matrix X, not to C */
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
  hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) { /* descriptor is stale: B's leading dimension changed */
      PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }
    if (mmdata->initialized && mmdata->Clda != clda) { /* descriptor is stale: C's leading dimension changed */
      PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }
    if (!mat->matDescr) {
      PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                            HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
    }
    /* grow the work buffer only when the required size increases */
    PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallHIP(hipFree(mmdata->mmBuffer));
      PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do hipsparseSpMM, which supports transpose on B */
  PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); /* one mul + one add per nonzero per output column */
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    /* C = R * X, i.e. R * (A * Rt) */
    PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    /* C = Pt * X, i.e. Pt * (A * P); first flag requests the transpose of B */
    PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); /* C was MATSEQDENSE on entry: hand the result back on the CPU */
  if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));          /* undo the temporary GPU conversion of B */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2342 
2343 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2344 {
2345   Mat_Product                   *product = C->product;
2346   Mat                            A, B;
2347   PetscInt                       m, n;
2348   PetscBool                      cisdense, flg;
2349   MatProductCtx_MatMatHipsparse *mmdata;
2350   Mat_SeqAIJHIPSPARSE           *cusp;
2351 
2352   PetscFunctionBegin;
2353   MatCheckProduct(C, 1);
2354   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2355   A = product->A;
2356   B = product->B;
2357   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2358   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2359   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2360   PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2361   switch (product->type) {
2362   case MATPRODUCT_AB:
2363     m = A->rmap->n;
2364     n = B->cmap->n;
2365     break;
2366   case MATPRODUCT_AtB:
2367     m = A->cmap->n;
2368     n = B->cmap->n;
2369     break;
2370   case MATPRODUCT_ABt:
2371     m = A->rmap->n;
2372     n = B->rmap->n;
2373     break;
2374   case MATPRODUCT_PtAP:
2375     m = B->cmap->n;
2376     n = B->cmap->n;
2377     break;
2378   case MATPRODUCT_RARt:
2379     m = B->rmap->n;
2380     n = B->rmap->n;
2381     break;
2382   default:
2383     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2384   }
2385   PetscCall(MatSetSizes(C, m, n, m, n));
2386   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2387   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2388   PetscCall(MatSetType(C, MATSEQDENSEHIP));
2389 
2390   /* product data */
2391   PetscCall(PetscNew(&mmdata));
2392   mmdata->cisdense = cisdense;
2393   /* for these products we need intermediate storage */
2394   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2395     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2396     PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2397     /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2398     if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2399     else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2400   }
2401   C->product->data       = mmdata;
2402   C->product->destroy    = MatProductCtxDestroy_MatMatHipsparse;
2403   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2404   PetscFunctionReturn(PETSC_SUCCESS);
2405 }
2406 
/* Numeric phase for sparse-sparse products C = op(A) op(B) with both operands
   MATSEQAIJHIPSPARSE. Uses hipSPARSE SpGEMM (hipsparseSpGEMMreuse_compute for
   HIP >= 5.1, hipsparseSpGEMM_compute for HIP >= 5.0, csrgemm otherwise).
   The sparsity pattern of C was fixed by the symbolic phase; this recomputes
   only the values and then runs the MatAssemblyEnd-style bookkeeping. */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscBool                      flg;
  MatProductType                 ptype;
  MatProductCtx_MatMatHipsparse *mmdata;
  hipsparseSpMatDescr_t          BmatSpDescr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* nothing to compute for an empty C; still run the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

  /* if A (B) is symmetric, the symbolic phase replaced AtB (ABt) by AB; replay that decision */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* spgemm cannot transpose, so transposed operands use the explicit transpose structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  /* pattern of C is reused; only the values are recomputed */
  PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
  #else
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
  PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
  #endif
#else
  /* legacy csrgemm path for HIP < 5.0 */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed at symbolic time */
  PetscCallHIP(WaitForHIP());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2521 
2522 static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2523 {
2524   Mat_Product                   *product = C->product;
2525   Mat                            A, B;
2526   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2527   Mat_SeqAIJ                    *a, *b, *c;
2528   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2529   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2530   PetscInt                       i, j, m, n, k;
2531   PetscBool                      flg;
2532   MatProductType                 ptype;
2533   MatProductCtx_MatMatHipsparse *mmdata;
2534   PetscLogDouble                 flops;
2535   PetscBool                      biscompressed, ciscompressed;
2536 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2537   int64_t               C_num_rows1, C_num_cols1, C_nnz1;
2538   hipsparseSpMatDescr_t BmatSpDescr;
2539 #else
2540   int cnz;
2541 #endif
2542   hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2543 
2544   PetscFunctionBegin;
2545   MatCheckProduct(C, 1);
2546   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2547   A = product->A;
2548   B = product->B;
2549   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2550   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2551   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2552   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2553   a = (Mat_SeqAIJ *)A->data;
2554   b = (Mat_SeqAIJ *)B->data;
2555   /* product data */
2556   PetscCall(PetscNew(&mmdata));
2557   C->product->data    = mmdata;
2558   C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2559 
2560   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2561   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2562   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2563   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2564   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2565   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2566 
2567   ptype = product->type;
2568   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2569     ptype                                          = MATPRODUCT_AB;
2570     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2571   }
2572   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2573     ptype                                          = MATPRODUCT_AB;
2574     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2575   }
2576   biscompressed = PETSC_FALSE;
2577   ciscompressed = PETSC_FALSE;
2578   switch (ptype) {
2579   case MATPRODUCT_AB:
2580     m    = A->rmap->n;
2581     n    = B->cmap->n;
2582     k    = A->cmap->n;
2583     Amat = Acusp->mat;
2584     Bmat = Bcusp->mat;
2585     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2586     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2587     break;
2588   case MATPRODUCT_AtB:
2589     m = A->cmap->n;
2590     n = B->cmap->n;
2591     k = A->rmap->n;
2592     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2593     Amat = Acusp->matTranspose;
2594     Bmat = Bcusp->mat;
2595     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2596     break;
2597   case MATPRODUCT_ABt:
2598     m = A->rmap->n;
2599     n = B->rmap->n;
2600     k = A->cmap->n;
2601     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2602     Amat = Acusp->mat;
2603     Bmat = Bcusp->matTranspose;
2604     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2605     break;
2606   default:
2607     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2608   }
2609 
2610   /* create hipsparse matrix */
2611   PetscCall(MatSetSizes(C, m, n, m, n));
2612   PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2613   c     = (Mat_SeqAIJ *)C->data;
2614   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2615   Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
2616   Ccsr  = new CsrMatrix;
2617 
2618   c->compressedrow.use = ciscompressed;
2619   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2620     c->compressedrow.nrows = a->compressedrow.nrows;
2621     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2622     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2623     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2624     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2625     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2626   } else {
2627     c->compressedrow.nrows  = 0;
2628     c->compressedrow.i      = NULL;
2629     c->compressedrow.rindex = NULL;
2630     Ccusp->workVector       = NULL;
2631     Cmat->cprowIndices      = NULL;
2632   }
2633   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2634   Ccusp->mat        = Cmat;
2635   Ccusp->mat->mat   = Ccsr;
2636   Ccsr->num_rows    = Ccusp->nrows;
2637   Ccsr->num_cols    = n;
2638   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2639   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2640   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2641   PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2642   PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2643   PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2644   PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2645   PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2646   PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2647   PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2648   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2649     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2650     c->nz                = 0;
2651     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2652     Ccsr->values         = new THRUSTARRAY(c->nz);
2653     goto finalizesym;
2654   }
2655 
2656   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2657   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2658   Acsr = (CsrMatrix *)Amat->mat;
2659   if (!biscompressed) {
2660     Bcsr        = (CsrMatrix *)Bmat->mat;
2661     BmatSpDescr = Bmat->matDescr;
2662   } else { /* we need to use row offsets for the full matrix */
2663     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2664     Bcsr                 = new CsrMatrix;
2665     Bcsr->num_rows       = B->rmap->n;
2666     Bcsr->num_cols       = cBcsr->num_cols;
2667     Bcsr->num_entries    = cBcsr->num_entries;
2668     Bcsr->column_indices = cBcsr->column_indices;
2669     Bcsr->values         = cBcsr->values;
2670     if (!Bcusp->rowoffsets_gpu) {
2671       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2672       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2673       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2674     }
2675     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2676     mmdata->Bcsr      = Bcsr;
2677     if (Bcsr->num_rows && Bcsr->num_cols) {
2678       PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2679     }
2680     BmatSpDescr = mmdata->matSpBDescr;
2681   }
2682   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2683   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2684   /* precompute flops count */
2685   if (ptype == MATPRODUCT_AB) {
2686     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2687       const PetscInt st = a->i[i];
2688       const PetscInt en = a->i[i + 1];
2689       for (j = st; j < en; j++) {
2690         const PetscInt brow = a->j[j];
2691         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2692       }
2693     }
2694   } else if (ptype == MATPRODUCT_AtB) {
2695     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2696       const PetscInt anzi = a->i[i + 1] - a->i[i];
2697       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2698       flops += (2. * anzi) * bnzi;
2699     }
2700   } else flops = 0.; /* TODO */
2701 
2702   mmdata->flops = flops;
2703   PetscCall(PetscLogGpuTimeBegin());
2704 #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2705   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2706   PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2707   PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2708   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2709   {
2710     /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2711      We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2712   */
2713     void *dBuffer1 = NULL;
2714     void *dBuffer2 = NULL;
2715     void *dBuffer3 = NULL;
2716     /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2717     size_t bufferSize1 = 0;
2718     size_t bufferSize2 = 0;
2719     size_t bufferSize3 = 0;
2720     size_t bufferSize4 = 0;
2721     size_t bufferSize5 = 0;
2722 
2723     /* ask bufferSize1 bytes for external memory */
2724     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2725     PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2726     /* inspect the matrices A and B to understand the memory requirement for the next step */
2727     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));
2728 
2729     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2730     PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2731     PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2732     PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2733     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2734     PetscCallHIP(hipFree(dBuffer1));
2735     PetscCallHIP(hipFree(dBuffer2));
2736 
2737     /* get matrix C non-zero entries C_nnz1 */
2738     PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2739     c->nz = (PetscInt)C_nnz1;
2740     /* allocate matrix C */
2741     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2742     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2743     Ccsr->values = new THRUSTARRAY(c->nz);
2744     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2745     /* update matC with the new pointers */
2746     if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2747       PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2748 
2749       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2750       PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2751       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2752       PetscCallHIP(hipFree(dBuffer3));
2753       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2754     }
2755     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2756   }
2757   #else
2758   size_t bufSize2;
2759   /* ask bufferSize bytes for external memory */
2760   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2761   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2762   /* inspect the matrices A and B to understand the memory requirement for the next step */
2763   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2764   /* ask bufferSize again bytes for external memory */
2765   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2766   /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2767      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2768      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2769      is stored in the descriptor! What a messy API... */
2770   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2771   /* compute the intermediate product of A * B */
2772   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2773   /* get matrix C non-zero entries C_nnz1 */
2774   PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2775   c->nz = (PetscInt)C_nnz1;
2776   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2777                       mmdata->mmBufferSize / 1024));
2778   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2779   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2780   Ccsr->values = new THRUSTARRAY(c->nz);
2781   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2782   PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2783   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2784   #endif
2785 #else
2786   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2787   PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2788                                           Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2789   c->nz                = cnz;
2790   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2791   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2792   Ccsr->values = new THRUSTARRAY(c->nz);
2793   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2794 
2795   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2796   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2797       I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2798       D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2799   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2800                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2801                                           Ccsr->column_indices->data().get()));
2802 #endif
2803   PetscCall(PetscLogGpuFlops(mmdata->flops));
2804   PetscCall(PetscLogGpuTimeEnd());
2805 finalizesym:
2806   c->free_a = PETSC_TRUE;
2807   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2808   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2809   c->free_ij = PETSC_TRUE;
2810   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2811     PetscInt      *d_i = c->i;
2812     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2813     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2814     ii = *Ccsr->row_offsets;
2815     jj = *Ccsr->column_indices;
2816     if (ciscompressed) d_i = c->compressedrow.i;
2817     PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2818     PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2819   } else {
2820     PetscInt *d_i = c->i;
2821     if (ciscompressed) d_i = c->compressedrow.i;
2822     PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2823     PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2824   }
2825   if (ciscompressed) { /* need to expand host row offsets */
2826     PetscInt r = 0;
2827     c->i[0]    = 0;
2828     for (k = 0; k < c->compressedrow.nrows; k++) {
2829       const PetscInt next = c->compressedrow.rindex[k];
2830       const PetscInt old  = c->compressedrow.i[k];
2831       for (; r < next; r++) c->i[r + 1] = old;
2832     }
2833     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2834   }
2835   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2836   PetscCall(PetscMalloc1(m, &c->ilen));
2837   PetscCall(PetscMalloc1(m, &c->imax));
2838   c->maxnz         = c->nz;
2839   c->nonzerorowcnt = 0;
2840   c->rmax          = 0;
2841   for (k = 0; k < m; k++) {
2842     const PetscInt nn = c->i[k + 1] - c->i[k];
2843     c->ilen[k] = c->imax[k] = nn;
2844     c->nonzerorowcnt += (PetscInt)!!nn;
2845     c->rmax = PetscMax(c->rmax, nn);
2846   }
2847   PetscCall(PetscMalloc1(c->nz, &c->a));
2848   Ccsr->num_entries = c->nz;
2849 
2850   C->nonzerostate++;
2851   PetscCall(PetscLayoutSetUp(C->rmap));
2852   PetscCall(PetscLayoutSetUp(C->cmap));
2853   Ccusp->nonzerostate = C->nonzerostate;
2854   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2855   C->preallocated     = PETSC_TRUE;
2856   C->assembled        = PETSC_FALSE;
2857   C->was_assembled    = PETSC_FALSE;
2858   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2859     mmdata->reusesym = PETSC_TRUE;
2860     C->offloadmask   = PETSC_OFFLOAD_GPU;
2861   }
2862   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2863   PetscFunctionReturn(PETSC_SUCCESS);
2864 }
2865 
2866 /* handles sparse or dense B */
/* Select the symbolic-product implementation for products involving a SeqAIJHIPSPARSE matrix.
   Handles both a sparse (SeqAIJHIPSPARSE) and a dense (SeqDense) B operand. Depending on the
   operand types, whether any operand is bound to the CPU, and user options, it installs either
   the GPU product kernels or falls back to the CPU SeqAIJ implementations. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* B only counts as a GPU operand if neither A nor B is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
  }
  /* Even when all operands live on the GPU, let the user force the CPU backend via options.
     The option name queried depends on whether the product was requested through the legacy
     user API (MatMatMult etc.) or the MatProduct API. */
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* pretend operands are not on the GPU so the CPU fallback below is taken */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized GPU kernel; compose the product from pairwise GPU products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
  PetscFunctionReturn(PETSC_SUCCESS);
}
2978 
/* yy = A*xx; thin wrapper over the shared kernel (no add vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2985 
/* zz = A*xx + yy; thin wrapper over the shared kernel (add vector yy, no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2992 
/* yy = A^H*xx; thin wrapper over the shared kernel (no add vector, transpose + Hermitian) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2999 
/* zz = A^H*xx + yy; thin wrapper over the shared kernel (add vector yy, transpose + Hermitian) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3006 
/* yy = A^T*xx; thin wrapper over the shared kernel (no add vector, transpose, no Hermitian) */
static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3013 
3014 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3015 {
3016   int i = blockIdx.x * blockDim.x + threadIdx.x;
3017   if (i < n) y[idx[i]] += x[i];
3018 }
3019 
3020 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared implementation behind all MatMult* wrappers: zz = op(A) xx (+ yy).
   op = identity when !trans, A^T when trans && !herm, A^H when trans && herm.
   Handles compressed-row storage (zero rows dropped) via a work vector plus a scatter. */
static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
  PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                      compressed; /* true when the mult struct drops zero rows */
  PetscInt                       nx, ny;     /* lengths of x and y in y = op(A) x; only set on the CSR path */

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* Empty matrix: result is just yy (or zero), no GPU work needed */
  if (!a->nz) {
    if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
    else PetscCall(VecSeq_HIP::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let hipSPARSE apply the (conjugate) transpose implicitly */
      opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose, applied with opA = non-transpose */
      if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
  try {
    PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecHIPGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::hip::par.on(PetscDefaultHipStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
      }
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
    }
    /* csr_spmv does y = alpha op(A) x + beta y */
    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
      /* opA indexes the per-operation cache of SpMV descriptors/buffers below */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
        PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
        PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
        PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
                                                    &matstruct->hipSpMV[opA].spmvBufferSize));
        PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
        matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
        PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
      }
      PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
                                       matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      /* non-CSR (HYB) storage path */
      if (hipsparsestruct->nrows) {
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
        PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                     /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                           /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_HIP::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                      /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_HIP::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecHIPPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
    else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    /* NOTE(review): this only catches thrown char* values; thrust normally throws
       std::exception-derived types which would propagate past this handler — confirm intent */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
  }
  if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3172 
/* zz = A^T * xx + yy. Thin wrapper over the shared mult-add kernel; the two trailing
   booleans select the transpose and (presumably) Hermitian variants — here transpose
   is requested and the Hermitian flag is off (confirm against the kernel's signature). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3179 
/* Finish assembly by delegating to the host SeqAIJ implementation; no GPU-specific
   work is performed here. */
static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3186 
3187 /*@
3188   MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
  This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3190 
3191   Collective
3192 
3193   Input Parameters:
3194 + comm - MPI communicator, set to `PETSC_COMM_SELF`
3195 . m    - number of rows
3196 . n    - number of columns
3197 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3198 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3199 
3200   Output Parameter:
3201 . A - the matrix
3202 
3203   Level: intermediate
3204 
3205   Notes:
3206   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3208   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3209 
3210   The AIJ format (compressed row storage), is fully compatible with standard Fortran
3211   storage.  That is, the stored row and column indices can begin at
3212   either one (as in Fortran) or zero.
3213 
3214   Specify the preallocated storage with either `nz` or `nnz` (not both).
3215   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3216   allocation.
3217 
3218 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3219 @*/
PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
  /* const is cast away only to match the preallocation routine's signature; nnz is not modified by us */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3229 
/* Destroy the GPU-side data (unfactored matrices and triangular factors use different
   spptr layouts), detach every method composed on the object, then run the base SeqAIJ
   destroy to release the host data. */
static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
  /* clear all composed function pointers so no stale implementations outlive the object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3248 
/* Duplicate via the host SeqAIJ path, then convert the copy in place back to the
   HIPSPARSE type so B carries the GPU machinery. */
static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3256 
/* Y = Y + a*X, performed on the GPU when both operands share this implementation.
   Strategy by nonzero-pattern relation:
     - SAME_NONZERO_PATTERN:   single hipBLAS axpy over the value arrays;
     - SUBSET_NONZERO_PATTERN: hipSPARSE csr spgeam (general sparse add) into Y;
     - otherwise:              fall back to the CPU SeqAIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJHIPSPARSE *cy;
  Mat_SeqAIJHIPSPARSE *cx;
  PetscScalar         *ay;
  const PetscScalar   *ax;
  CsrMatrix           *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU case: Y's GPU transpose becomes stale, then do the add on the CPU */
    PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a hipblas axpy: if both CSR structures (row offsets and
     column indices) compare equal on the device, upgrade str to SAME_NONZERO_PATTERN */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* computes a*X + 1.0*Y in place into Y */
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
    /* host pointer mode: the scalars a and b live on the host for this call */
    PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* newer hipSPARSE requires an explicit workspace query + allocation for spgeam */
    PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                       csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallHIP(hipMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallHIP(hipFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the library's default device pointer mode */
    PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the add is a dense axpy over the nz-long value arrays */
    hipblasHandle_t hipblasv2handle;
    PetscBLASInt    one = 1, bnz = 1;

    PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
    PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
  } else {
    /* unrelated patterns: invalidate the GPU transpose and fall back to the CPU kernel */
    PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3339 
/* Y *= a, performed on the GPU as a hipBLAS scal over the nz-long value array. */
static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ     *y = (Mat_SeqAIJ *)Y->data;
  PetscScalar    *ay;
  hipblasHandle_t hipblasv2handle;
  PetscBLASInt    one = 1, bnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay)); /* device pointer to the CSR values */
  PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
  PetscCall(PetscBLASIntCast(y->nz, &bnz)); /* guard against PetscInt -> BLAS int overflow */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
  PetscCall(PetscLogGpuFlops(bnz));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3358 
/* Zero all stored values. Both device copies (the matrix and its cached transpose, when
   present) are zeroed with thrust::fill, and the host array is zeroed as well; the
   offload mask records whether the device copy is still valid afterwards. */
static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the device values were zeroed too */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  //PetscCall(MatZeroEntries_SeqAIJ(A));
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); /* host values: a->i[nrows] == total nz */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3385 
/* Report HIP device memory as the matrix's current memory type. */
static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_HIP;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3392 
/* Bind the matrix to the CPU (flg = PETSC_TRUE) or the GPU (flg = PETSC_FALSE) by
   swapping the whole operation table and the composed methods between the SeqAIJ and
   SeqAIJHIPSPARSE implementations. Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* moving to the CPU: first make sure the host copy of the values is current */
    PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* wipe the inner SeqAIJ op table so the default host array accessors are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
  } else {
    /* moving to the GPU: install the HIPSPARSE implementations and device array accessors */
    A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
    A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJHIPSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-only optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size_csr) a->inode.use = PETSC_TRUE;
  else a->inode.use = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3454 
/* Convert a SeqAIJ matrix to SeqAIJHIPSPARSE. Handles all three MatReuse modes; for
   MAT_INPLACE_MATRIX the input matrix itself is retargeted. Allocates the hipSPARSE
   context in spptr (a different struct for factored matrices), installs the GPU op
   table via MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE), and composes the
   type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;
  /* vectors created from this matrix should default to the HIP vector type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJHIPSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      spptr->format = MAT_HIPSPARSE_CSR;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
      spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
#else
      spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor context instead */
      Mat_SeqAIJHIPSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJHIPSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJHIPSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJHIPSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJHIPSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJHIPSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJHIPSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;

  PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE)); /* installs the GPU operations */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3512 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place to the
   HIPSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3520 
3521 /*MC
3522    MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
3523 
3524    A matrix type whose data resides on AMD GPUs. These matrices can be in either
3525    CSR, ELL, or Hybrid format.
   All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.
3527 
3528    Options Database Keys:
3529 +  -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3530 .  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3531                                        Other options include ell (ellpack) or hyb (hybrid).
3532 . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3533 -  -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3534 
3535   Level: beginner
3536 
3537 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3538 M*/
3539 
/* Register the hipSPARSE solver package for all four factorization types supported by
   this matrix type; the same factory routine serves them all. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3549 
/* Free the GPU-side context of an unfactored matrix: both mult structs (the matrix and
   its cached transpose), the Thrust-managed work vectors and index arrays, and the
   hipSPARSE handle. (The local name "cusp" is legacy from the CUDA version.) */
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3567 
3568 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3569 {
3570   PetscFunctionBegin;
3571   if (*mat) {
3572     delete (*mat)->values;
3573     delete (*mat)->column_indices;
3574     delete (*mat)->row_offsets;
3575     delete *mat;
3576     *mat = 0;
3577   }
3578   PetscFunctionReturn(PETSC_SUCCESS);
3579 }
3580 
/* Free one triangular-factor struct: its hipSPARSE descriptors, CSR storage, and the
   device/host scratch buffers, then the struct itself. Safe to call with *trifactor
   already NULL. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h)); /* pinned host memory: hipHostFree, not hipFree */
    if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3595 
3596 static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3597 {
3598   CsrMatrix *mat;
3599 
3600   PetscFunctionBegin;
3601   if (*matstruct) {
3602     if ((*matstruct)->mat) {
3603       if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3604         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3605         PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
3606       } else {
3607         mat = (CsrMatrix *)(*matstruct)->mat;
3608         PetscCall(CsrMatrix_Destroy(&mat));
3609       }
3610     }
3611     if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
3612     delete (*matstruct)->cprowIndices;
3613     if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
3614     if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
3615     if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));
3616 
3617     Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3618     if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
3619     for (int i = 0; i < 3; i++) {
3620       if (mdata->hipSpMV[i].initialized) {
3621         PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
3622         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
3623         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
3624       }
3625     }
3626     delete *matstruct;
3627     *matstruct = NULL;
3628   }
3629   PetscFunctionReturn(PETSC_SUCCESS);
3630 }
3631 
/* Release everything inside a TriFactors context (factor structs, permutation index
   vectors, work vector, and — for HIP >= 4.5 — the SpSV descriptors and buffers of the
   newer factorization path) while keeping the struct and its hipsparse handle alive,
   so the context can be refilled by a new factorization. */
PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->workVector    = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* members of the SpSV-based factorization path, only present on newer HIP */
    PetscCallHIP(hipFree(fs->csrRowPtr));
    PetscCallHIP(hipFree(fs->csrColIdx));
    PetscCallHIP(hipFree(fs->csrVal));
    PetscCallHIP(hipFree(fs->X));
    PetscCallHIP(hipFree(fs->Y));
    // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallHIP(hipFree(fs->spsvBuffer_L));
    PetscCallHIP(hipFree(fs->spsvBuffer_U));
    PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
    PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
    PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3678 
3679 static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3680 {
3681   hipsparseHandle_t handle;
3682 
3683   PetscFunctionBegin;
3684   if (*trifactors) {
3685     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3686     if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3687     PetscCall(PetscFree(*trifactors));
3688   }
3689   PetscFunctionReturn(PETSC_SUCCESS);
3690 }
3691 
3692 struct IJCompare {
3693   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3694   {
3695     if (t1.get<0>() < t2.get<0>()) return true;
3696     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3697     return false;
3698   }
3699 };
3700 
/* Mark the cached GPU transpose as out of date. If destroy is PETSC_TRUE, free the
   transpose struct and the csr2csc index map as well; otherwise just clear the
   transupdated flag so the transpose is rebuilt/updated on next use. */
static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); /* nothing allocated yet */
  if (destroy) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3716 
/* Container destructor for the device-side COO struct: free its two device arrays
   (perm, jmap) and then the host-allocated struct itself. data is a pointer to the
   stored MatCOOStruct_SeqAIJ pointer. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(PetscCtxRt data)
{
  MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)data;

  PetscFunctionBegin;
  PetscCallHIP(hipFree(coo->perm));
  PetscCallHIP(hipFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3727 
/* Set up COO assembly. The host SeqAIJ implementation does the index analysis (it
   needs host copies of coo_i/coo_j, so device input is staged through temporary host
   buffers), after which the resulting COO struct is shallow-copied with its jmap and
   perm arrays mirrored to device memory and attached to the matrix in a container. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE; /* were coo_i/coo_j given in device memory? */
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host preallocation routine needs host arrays: copy the indices down */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* preallocation populated the host structure */
  // Create the GPU memory
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3767 
/* Device kernel: accumulate user-provided COO values kv[] into the matrix's CSR value
   array a[]. For nonzero i, perm[jmap[i]..jmap[i+1]) lists the positions in kv[] that
   map to that nonzero; their sum overwrites a[i] for INSERT_VALUES or is added to it
   otherwise. Uses a grid-stride loop so any launch configuration covers all nnz. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
3778 
/* Insert/add COO values on the GPU using the device-side COO struct prepared by
   MatSetPreallocationCOO_SeqAIJHIPSPARSE(). Host-resident v[] is staged to a temporary
   device buffer; the MatAddCOOValues kernel then folds the values into the CSR array. */
static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *dev  = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v; /* device-accessible view of v[] */
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* make sure the device CSR exists */

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, &coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
  }

  /* write-only access suffices for INSERT_VALUES since every nonzero is overwritten */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallHIP(hipPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1)); /* release the staging buffer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3818 
3819 /*@C
3820   MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3821 
3822   Not Collective
3823 
3824   Input Parameters:
3825 + A          - the matrix
3826 - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3827 
3828   Output Parameters:
3829 + i - the CSR row pointers
3830 - j - the CSR column indices
3831 
3832   Level: developer
3833 
3834   Note:
3835   When compressed is true, the CSR structure does not contain empty rows
3836 
3837 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3838 @*/
3839 PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
3840 {
3841   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3842   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
3843   CsrMatrix           *csr;
3844 
3845   PetscFunctionBegin;
3846   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3847   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
3848   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3849   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3850   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3851   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3852   csr = (CsrMatrix *)cusp->mat->mat;
3853   if (i) {
3854     if (!compressed && a->compressedrow.use) { /* need full row offset */
3855       if (!cusp->rowoffsets_gpu) {
3856         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3857         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3858         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3859       }
3860       *i = cusp->rowoffsets_gpu->data().get();
3861     } else *i = csr->row_offsets->data().get();
3862   }
3863   if (j) *j = csr->column_indices->data().get();
3864   PetscFunctionReturn(PETSC_SUCCESS);
3865 }
3866 
3867 /*@C
3868   MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3869 
3870   Not Collective
3871 
3872   Input Parameters:
3873 + A          - the matrix
3874 . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3875 . i          - the CSR row pointers
3876 - j          - the CSR column indices
3877 
3878   Level: developer
3879 
3880 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3881 @*/
3882 PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
3883 {
3884   PetscFunctionBegin;
3885   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3886   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3887   if (i) *i = NULL;
3888   if (j) *j = NULL;
3889   PetscFunctionReturn(PETSC_SUCCESS);
3890 }
3891 
3892 /*@C
3893   MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3894 
3895   Not Collective
3896 
3897   Input Parameter:
3898 . A - a `MATSEQAIJHIPSPARSE` matrix
3899 
3900   Output Parameter:
3901 . a - pointer to the device data
3902 
3903   Level: developer
3904 
3905   Note:
3906   May trigger host-device copies if the up-to-date matrix data is on host
3907 
3908 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3909 @*/
3910 PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
3911 {
3912   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3913   CsrMatrix           *csr;
3914 
3915   PetscFunctionBegin;
3916   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3917   PetscAssertPointer(a, 2);
3918   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3919   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3920   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3921   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3922   csr = (CsrMatrix *)cusp->mat->mat;
3923   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3924   *a = csr->values->data().get();
3925   PetscFunctionReturn(PETSC_SUCCESS);
3926 }
3927 
3928 /*@C
3929   MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3930 
3931   Not Collective
3932 
3933   Input Parameters:
3934 + A - a `MATSEQAIJHIPSPARSE` matrix
3935 - a - pointer to the device data
3936 
3937   Level: developer
3938 
3939 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3940 @*/
3941 PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
3942 {
3943   PetscFunctionBegin;
3944   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3945   PetscAssertPointer(a, 2);
3946   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3947   *a = NULL;
3948   PetscFunctionReturn(PETSC_SUCCESS);
3949 }
3950 
3951 /*@C
3952   MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3953 
3954   Not Collective
3955 
3956   Input Parameter:
3957 . A - a `MATSEQAIJHIPSPARSE` matrix
3958 
3959   Output Parameter:
3960 . a - pointer to the device data
3961 
3962   Level: developer
3963 
3964   Note:
3965   May trigger host-device copies if up-to-date matrix data is on host
3966 
3967 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3968 @*/
3969 PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
3970 {
3971   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3972   CsrMatrix           *csr;
3973 
3974   PetscFunctionBegin;
3975   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
3976   PetscAssertPointer(a, 2);
3977   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3978   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3979   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3980   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3981   csr = (CsrMatrix *)cusp->mat->mat;
3982   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3983   *a             = csr->values->data().get();
3984   A->offloadmask = PETSC_OFFLOAD_GPU;
3985   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
3986   PetscFunctionReturn(PETSC_SUCCESS);
3987 }
3988 /*@C
3989   MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
3990 
3991   Not Collective
3992 
3993   Input Parameters:
3994 + A - a `MATSEQAIJHIPSPARSE` matrix
3995 - a - pointer to the device data
3996 
3997   Level: developer
3998 
3999 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4000 @*/
4001 PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
4002 {
4003   PetscFunctionBegin;
4004   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4005   PetscAssertPointer(a, 2);
4006   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4007   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4008   *a = NULL;
4009   PetscFunctionReturn(PETSC_SUCCESS);
4010 }
4011 
4012 /*@C
4013   MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4014 
4015   Not Collective
4016 
4017   Input Parameter:
4018 . A - a `MATSEQAIJHIPSPARSE` matrix
4019 
4020   Output Parameter:
4021 . a - pointer to the device data
4022 
4023   Level: developer
4024 
4025   Note:
4026   Does not trigger host-device copies and flags data validity on the GPU
4027 
4028 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4029 @*/
4030 PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
4031 {
4032   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4033   CsrMatrix           *csr;
4034 
4035   PetscFunctionBegin;
4036   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4037   PetscAssertPointer(a, 2);
4038   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4039   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4040   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4041   csr = (CsrMatrix *)cusp->mat->mat;
4042   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4043   *a             = csr->values->data().get();
4044   A->offloadmask = PETSC_OFFLOAD_GPU;
4045   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
4046   PetscFunctionReturn(PETSC_SUCCESS);
4047 }
4048 
4049 /*@C
4050   MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
4051 
4052   Not Collective
4053 
4054   Input Parameters:
4055 + A - a `MATSEQAIJHIPSPARSE` matrix
4056 - a - pointer to the device data
4057 
4058   Level: developer
4059 
4060 .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4061 @*/
4062 PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
4063 {
4064   PetscFunctionBegin;
4065   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4066   PetscAssertPointer(a, 2);
4067   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4068   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4069   *a = NULL;
4070   PetscFunctionReturn(PETSC_SUCCESS);
4071 }
4072 
4073 struct IJCompare4 {
4074   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4075   {
4076     if (t1.get<0>() < t2.get<0>()) return true;
4077     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4078     return false;
4079   }
4080 };
4081 
4082 struct Shift {
4083   int _shift;
4084 
4085   Shift(int shift) : _shift(shift) { }
4086   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4087 };
4088 
/* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation.
   MAT_INITIAL_MATRIX builds C (structure + values + optional transpose) on the device;
   MAT_REUSE_MATRIX only refreshes values using the cached permutation in Ccusp->coords. */
PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscInt                       Annz, Bnnz;
  PetscInt                       i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
  /* the merge concatenates columns, so A and B must have the same row count */
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Build C = [A B] from scratch: allocate the hipSPARSE mult struct and CSR storage by hand */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C is stored with full (uncompressed) rows */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
    PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
    PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the hipSPARSE SpMV/SpMM calls */
    PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* coords records, per entry of C, where it came from (first A's entries, then B's);
       it is reused in the MAT_REUSE_MATRIX branch to scatter fresh values into C */
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* Convert A and B to COO row indices, tag entries with a 1 (A) / 0 (B) flag, and
         merge the two listings into row-major order with a single thrust::merge */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
      PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
      /* B's column indices are shifted right by A->cmap->n so they land in C's column space */
      auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->coords->begin();
      auto p2    = Ccusp->coords->begin();
      thrust::advance(p2, Annz); /* p1 receives positions of A's entries, p2 those of B's */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
#else
      /* split the merged positions by origin flag: copy_if picks A's entries (flag 1),
         remove_copy_if keeps B's (flag 0); together they emulate partition_copy above */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
      PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));

      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T]: stack A^T on top of B^T, which in CSR form is a plain
           concatenation of row offsets (B's shifted by a->nz), column indices and values */
        PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
        PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
        CsrMatrix                     *CcsrT = new CsrMatrix;
        CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;
        CcsrT->row_offsets            = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices         = new THRUSTINTARRAY32(c->nz);
        CcsrT->values                 = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* step back so B^T's first offset overwrites A^T's trailing one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

        PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
        Ccusp->matTranspose = CmatT;
      }
    }

    /* Mirror the assembled structure back to the host Mat_SeqAIJ so host-side code sees a valid matrix */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
      PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    } else {
      PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
      PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* derive per-row lengths and row statistics from the row offsets just copied to the host */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(PetscMalloc1(c->nz, &c->a)); /* host values array allocated but not filled; device holds the data */
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is fixed; only re-scatter A's and B's current values into C */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries); /* coords[0..Annz) locate A's entries in C, the rest locate B's */
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C via the cached permutation */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecHIPEquals());
      /* then B's values */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecHIPEquals());
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        /* C^T values are simply A^T's values followed by B^T's (no permutation needed) */
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4349 
4350 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4351 {
4352   bool               dmem;
4353   const PetscScalar *av;
4354 
4355   PetscFunctionBegin;
4356   dmem = isHipMem(v);
4357   PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4358   if (n && idx) {
4359     THRUSTINTARRAY widx(n);
4360     widx.assign(idx, idx + n);
4361     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4362 
4363     THRUSTARRAY                    *w = NULL;
4364     thrust::device_ptr<PetscScalar> dv;
4365     if (dmem) dv = thrust::device_pointer_cast(v);
4366     else {
4367       w  = new THRUSTARRAY(n);
4368       dv = w->data();
4369     }
4370     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4371 
4372     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4373     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4374     thrust::for_each(zibit, zieit, VecHIPEquals());
4375     if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4376     delete w;
4377   } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4378 
4379   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4380   PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4381   PetscFunctionReturn(PETSC_SUCCESS);
4382 }
4383