xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 0ea77eda237b1068ff0d8bfa28c3463dc2087695)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Names of MatCUSPARSEStorageFormat values, listed in 0-based enum order; the trailing three
   entries (enum type name, option prefix, NULL terminator) are the metadata PetscOptionsEnum()
   requires to parse command-line options into this enum */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
/* NOTE: the string positions below must stay in the 0-based numeric-value order of the cuSPARSE
   enums quoted above; MatSetFromOptions_SeqAIJCUSPARSE() cross-checks this with PetscCheck() */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
94 
/*
  Type-specific implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE: records the
  requested GPU storage format in the matrix's Mat_SeqAIJCUSPARSE structure. Only
  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are accepted; both set the same (single) format field.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a sequential matrix has one storage format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
106 
107 /*@
108    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
109    operation. Only the `MatMult()` operation can use different GPU storage formats
110 
111    Not Collective
112 
113    Input Parameters:
114 +  A - Matrix of type `MATSEQAIJCUSPARSE`
115 .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
116         `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
117 -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
118 
119    Output Parameter:
120 
121    Level: intermediate
122 
123 .seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
124 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation registered under "MatCUSPARSESetFormat_C" (e.g.
     MatCUSPARSESetFormat_SeqAIJCUSPARSE); PetscTryMethod() is a no-op for types without it */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
131 
/*
  Type-specific implementation of MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE: stores the
  flag that selects the built-in CPU MatSolve() instead of the GPU triangular solve.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
139 
140 /*@
141    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
142 
143    Input Parameters:
144 +  A - Matrix of type `MATSEQAIJCUSPARSE`
145 -  use_cpu - set flag for using the built-in CPU `MatSolve()`
146 
147    Output Parameter:
148 
149    Note:
150    The cuSparse LU solver currently computes the factors with the built-in CPU method
151    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
152    This method is used to specify whether the solve is done on the CPU or the GPU (the GPU is the default).
153 
154    Level: intermediate
155 
156 .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
157 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation registered under "MatCUSPARSESetUseCPUSolve_C";
     PetscTryMethod() silently ignores matrix types that do not provide it */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
164 
/*
  MatSetOption() implementation for MATSEQAIJCUSPARSE. Handles MAT_FORM_EXPLICIT_TRANSPOSE
  itself (invalidating any cached GPU transpose when the option is switched off) and delegates
  every other option to the host MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* destroy a cached transpose when turning the option off so a stale copy cannot be used
       if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
177 
178 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
179 
/*
  Numeric LU factorization for MATSEQAIJCUSPARSE: pulls A's values back to the host, runs the
  built-in CPU factorization, then (unless the CPU solve was requested) installs the GPU solve
  kernels — picking the natural-ordering fast path when both permutations are the identity —
  and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscBool           rowIdentity, colIdentity;

  PetscFunctionBegin;
  /* the CPU kernel below reads host data, so make sure the latest values are there */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors currently live on the host only */

  /* choose the MatSolve() variant: the natural-ordering kernels skip the permutation step */
  PetscCall(ISIdentity(b->row, &rowIdentity));
  PetscCall(ISIdentity(b->col, &colIdentity));
  if (!cusp->use_cpu_solve) {
    if (rowIdentity && colIdentity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU and run the cuSPARSE solve analysis */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
210 
/*
  Processes the -mat_cusparse_* runtime options for an unfactored MATSEQAIJCUSPARSE matrix:
  GPU storage format (for SpMV only, or for all operations), CPU-vs-GPU triangular solve, and,
  for CUDA >= 11, the cuSPARSE SpMV / SpMM / CSR-to-CSC algorithm selections.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { /* these options apply only to unfactored matrices */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same positional-consistency guard for the SpMM algorithm table */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    /* same positional-consistency guard for the CSR-to-CSC algorithm table */
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
245 
/*
  Builds (or refreshes) the unit-diagonal lower-triangular ILU factor of A on the GPU from the
  host CSR factor data. On the first call it assembles a CSR copy of L in pinned host memory
  (inserting the implicit 1's on the diagonal), uploads it into a new
  Mat_SeqAIJCUSPARSETriFactorStruct, and runs the cuSPARSE triangular-solve analysis; on later
  calls (same sparsity) it only re-copies the numerical values. No-op unless the up-to-date
  data currently lives on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo; /* row offsets / column indices of L, built in pinned host memory */
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first time: build the structure as well as the values */
        PetscScalar *AALo;

        /* pinned (page-locked) host buffers make the thrust host-to-device copies below faster */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz      = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower part of row i, then append the unit diagonal entry */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign() calls upload the pinned host arrays to the device */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* CUDA >= 9 csrsv2 API needs an explicit device work buffer for the analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h (the pinned value buffer) is kept for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix, repeating the first-time traversal but writing values only */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
378 
/*
  Builds (or refreshes) the upper-triangular ILU factor of A on the GPU from the host CSR factor
  data. The host factor stores U's rows backwards via a->diag, so the assembly loop walks rows
  from n-1 down to 0; the diagonal entry is stored as its reciprocal (1./v[nz]) as computed by
  the host factorization. First call allocates the CSR structure, uploads it, and runs the
  cuSPARSE triangular-solve analysis; later calls re-copy only the values. No-op unless the
  up-to-date data currently lives on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp; /* row offsets / column indices of U, built in pinned host memory */
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first time: build the structure as well as the values */
        PetscScalar *AAUp;

        /* pinned (page-locked) host buffers make the thrust host-to-device copies below faster */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows backwards to follow adiag[] */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements (stored as the reciprocal, as in the host factor) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* the assign() calls upload the pinned host arrays to the device */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* CUDA >= 9 csrsv2 API needs an explicit device work buffer for the analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h (the pinned value buffer) is kept for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix, repeating the first-time traversal but writing values only */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
510 
/*
  Ensures both ILU triangular factors of A are present and analyzed on the GPU, allocates the
  device work vector used by the triangular solves, and caches the row/column permutation
  indices on the device (only when the corresponding ordering is not the identity). Marks the
  matrix as offloaded to both host and device on success.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* device scratch vector for the intermediate result of the two-stage triangular solve */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation on the device, once, when non-trivial */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n); /* host-to-device copy */
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices: same caching for the (inverse) column permutation */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n); /* host-to-device copy */
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
552 
/* Copy the ICC(0) factor of A (held on the CPU in SeqSBAIJ layout) to the GPU as a pair of
   cuSPARSE triangular-solve structures: an upper-triangular CSR factor with unit diagonal
   (the reciprocal of the diagonal is stored explicitly in the values) and a "lower" factor
   that reuses the same CSR sparsity pattern but is solved with the transpose operation.
   On the first call the CSR storage, matrix descriptors and csrsv solve-analysis data are
   created; on later calls only the numerical values are refreshed on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* pinned host buffers: CSR row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;        /* pinned host buffer: values of the upper factor */
  PetscScalar                       *AALo;        /* pinned host buffer: values of the "lower" factor (upper values scaled by 1/diag) */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is cast both as Mat_SeqAIJ (for nz) and as Mat_SeqSBAIJ (for i/j/a);
     the ICC factor apparently stores its data in SBAIJ layout with the diagonal entry last in
     each row (see the "exclude diag[i]" arithmetic below) -- confirm against the factorization code */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned (page-locked) host memory so the thrust assign()s below are fast H2D copies */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* First call: build the structure as well as the values.
           Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers to row i of the SBAIJ factor */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i]; v[nz] is the diagonal entry */

          /* first, set the diagonal elements: the reciprocal of the diagonal is stored so the
             GPU solve multiplies instead of dividing (descriptor uses DIAG_TYPE_UNIT for up) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* off-diagonals: negated for the upper factor, additionally scaled by 1/diag for the lower */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix (upload the CSR arrays built above to the device) */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 requires a caller-provided work buffer; query its size and allocate it */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the lower factor is stored with the UPPER pattern (same AiUp/AjUp as above) and
           solved with the transpose operation below; diagonal is explicit (NON_UNIT) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: shares the sparsity pattern of the upper factor, values are AALo */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structure already lives on the GPU: recompute values only.
           Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (reciprocal, as on the first pass) */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        /* upload the refreshed values to the existing device arrays */
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } /* thrust/cusparse C++ layer throws char* */
  }
  PetscFunctionReturn(0);
}
754 
/* Ensure the GPU-side ICC triangular factors of A are present and current, allocate the
   scratch vector used by the triangular solves, and, when the row ordering is not the
   identity, upload the forward and inverse permutation indices to the device. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscBool                     identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = (aij->nz - n) * 2 + n; /* both triangular factors, diagonal counted once */

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices are needed on the GPU only for a non-identity ordering */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *invIdx, *permIdx;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &invIdx));
    PetscCall(ISGetIndices(rowperm, &permIdx));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(permIdx, permIdx + n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(invIdx, invIdx + n);
    PetscCall(ISRestoreIndices(invperm, &invIdx));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm, &permIdx));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
790 
/* Numeric Cholesky factorization: performs the factorization on the CPU, then selects the
   appropriate GPU solve kernels (the natural-ordering variants avoid vector permutation)
   and uploads the triangular factors to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *bfact   = (Mat_SeqAIJ *)B->data;
  IS          rowperm = bfact->row;
  PetscBool   natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  PetscCall(ISIdentity(rowperm, &natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
818 
/* Build the transposes of the lower and upper triangular factors held in A->spptr (by a
   GPU csr2csc conversion of each factor) and run the cuSPARSE csrsv solve analysis on the
   results, so transpose solves can later use CUSPARSE_OPERATION_NON_TRANSPOSE kernels.
   The transposed factors are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose.

   Bug fixed: both MAT_CUSPARSEGenerateTranspose logging sections previously called
   PetscLogEventBegin() twice and never ended the event (corrupting -log_view output);
   the closing call of each pair is now PetscLogEventEnd(). */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor of the lower triangular factor, flipping the fill mode since the
     transpose of a lower triangle is an upper triangle (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor (rows/cols swapped) */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* csr2cscEx2 requires a caller-provided work buffer; query its size and allocate it */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor of the upper triangular factor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor (rows/cols swapped) */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
981 
/* Thrust-compatible unary functor: converts a PetscScalar to a PetscInt by truncating its real part. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar val) { return static_cast<PetscInt>(PetscRealPart(val)); }
};
985 
986 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
987   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
988   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
989   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
990   cusparseStatus_t              stat;
991   cusparseIndexBase_t           indexBase;
992 
993   PetscFunctionBegin;
994   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
995   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
996   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
997   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
998   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
999   if (A->transupdated) PetscFunctionReturn(0);
1000   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1001   PetscCall(PetscLogGpuTimeBegin());
1002   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1003   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1004     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1005     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1006     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1007     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1008     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1009 
1010     /* set alpha and beta */
1011     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1012     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1013     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1014     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1015     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1016     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1017 
1018     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1019       CsrMatrix *matrixT      = new CsrMatrix;
1020       matstructT->mat         = matrixT;
1021       matrixT->num_rows       = A->cmap->n;
1022       matrixT->num_cols       = A->rmap->n;
1023       matrixT->num_entries    = a->nz;
1024       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1025       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1026       matrixT->values         = new THRUSTARRAY(a->nz);
1027 
1028       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1029       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1030 
1031 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1032 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1033       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1034                                indexBase, cusparse_scalartype);
1035       PetscCallCUSPARSE(stat);
1036 #else
1037       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1038            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1039 
1040            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1041            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1042            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1043         */
1044       if (matrixT->num_entries) {
1045         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1046         PetscCallCUSPARSE(stat);
1047 
1048       } else {
1049         matstructT->matDescr = NULL;
1050         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1051       }
1052 #endif
1053 #endif
1054     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1055 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1056       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1057 #else
1058       CsrMatrix *temp = new CsrMatrix;
1059       CsrMatrix *tempT = new CsrMatrix;
1060       /* First convert HYB to CSR */
1061       temp->num_rows = A->rmap->n;
1062       temp->num_cols = A->cmap->n;
1063       temp->num_entries = a->nz;
1064       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1065       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1066       temp->values = new THRUSTARRAY(a->nz);
1067 
1068       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1069       PetscCallCUSPARSE(stat);
1070 
1071       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1072       tempT->num_rows = A->rmap->n;
1073       tempT->num_cols = A->cmap->n;
1074       tempT->num_entries = a->nz;
1075       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1076       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1077       tempT->values = new THRUSTARRAY(a->nz);
1078 
1079       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1080                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1081       PetscCallCUSPARSE(stat);
1082 
1083       /* Last, convert CSC to HYB */
1084       cusparseHybMat_t hybMat;
1085       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1086       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1087       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1088       PetscCallCUSPARSE(stat);
1089 
1090       /* assign the pointer */
1091       matstructT->mat = hybMat;
1092       A->transupdated = PETSC_TRUE;
1093       /* delete temporaries */
1094       if (tempT) {
1095         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1096         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1097         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1098         delete (CsrMatrix *)tempT;
1099       }
1100       if (temp) {
1101         if (temp->values) delete (THRUSTARRAY *)temp->values;
1102         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1103         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1104         delete (CsrMatrix *)temp;
1105       }
1106 #endif
1107     }
1108   }
1109   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1110     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1111     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1112     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1113     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1114     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1115     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1116     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1117     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1118     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1119     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1120     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1121       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1122       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1123       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1124     }
1125     if (!cusparsestruct->csr2csc_i) {
1126       THRUSTARRAY csr2csc_a(matrix->num_entries);
1127       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1128 
1129       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1130 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1131       void  *csr2cscBuffer;
1132       size_t csr2cscBufferSize;
1133       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1134                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1135       PetscCallCUSPARSE(stat);
1136       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1137 #endif
1138 
1139       if (matrix->num_entries) {
1140         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1141            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1142            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1143 
1144            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1145            should be filled with indexBase. So I just take a shortcut here.
1146         */
1147         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1148 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1149                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1150         PetscCallCUSPARSE(stat);
1151 #else
1152                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1153         PetscCallCUSPARSE(stat);
1154 #endif
1155       } else {
1156         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1157       }
1158 
1159       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1160       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1161 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1162       PetscCallCUDA(cudaFree(csr2cscBuffer));
1163 #endif
1164     }
1165     PetscCallThrust(
1166       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1167   }
1168   PetscCall(PetscLogGpuTimeEnd());
1169   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1170   /* the compressed row indices is not used for matTranspose */
1171   matstructT->cprowIndices                       = NULL;
1172   /* assign the pointer */
1173   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1174   A->transupdated                                = PETSC_TRUE;
1175   PetscFunctionReturn(0);
1176 }
1177 
1178 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the transposed triangular factors, with row/column permutations
   (i.e., the factorization was done with a non-natural ordering).

   Steps: gather b with the row permutation (into x, used as scratch), solve with the
   transposed upper factor, then the transposed lower factor (reverse of the forward-solve
   order), and finally gather the result with the column permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Lazily analyze the matrix and create the transposed factors on first use */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation; the result lands in x, used as scratch */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the transposed upper factor: rhs = xarray, result -> tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor: rhs = tempGPU, result -> xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, gather the solution with the column permutation into a temporary ... cannot be done in place */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1251 
/* Solve A^T x = b using the transposed triangular factors when the factorization used the
   natural ordering: no row/column permutations are needed, so b is fed directly to the
   transposed upper solve and the result of the transposed lower solve lands directly in x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Lazily analyze the matrix and create the transposed factors on first use */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor: rhs = barray, result -> tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor: rhs = tempGPU, result -> xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1310 
/* Forward solve A x = b with the triangular factors, with row/column permutations
   (i.e., the factorization was done with a non-natural ordering).

   Steps: gather b with the row permutation into the work vector, solve with the lower
   factor, then the upper factor, and finally gather the result with the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation; the result lands in the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve with the lower factor: rhs = tempGPU, result -> xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the upper factor: rhs = xarray, result -> tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation, writing the final solution into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1372 
/* Forward solve A x = b with the triangular factors when the factorization used the natural
   ordering: no permutations are needed, so b feeds the lower solve directly and the upper
   solve writes directly into x; only one intermediate (the work vector) is used. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the lower factor: rhs = barray, result -> tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve with the upper factor: rhs = tempGPU, result -> xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1423 
1424 #if CUSPARSE_VERSION >= 11500
1425 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/* Forward solve with the ILU(0) factors via the cuSPARSE generic SpSV API:
   L y = b is solved into the factor's work vector Y, then U x = y into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Lower triangular solve: L y = b, with y kept in the work vector Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* L Y = X */
                                       factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       factors->spsvDescr_L)); /* cusparseSpSV_solve() implicitly reuses the external buffer that was given to cusparseSpSV_analysis()! */

  /* Upper triangular solve: U x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, /* U X = Y */
                                       factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: 2*nz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1456 
/* Transpose solve with the ILU(0) factors via the cuSPARSE generic SpSV API:
   U^T y = b is solved into the work vector Y, then L^T x = y into x. The transpose
   SpSV descriptors/buffers are created on first call, and the (numeric) analysis is
   redone whenever the factor values have changed since the last transpose solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  /* First transpose solve ever: create the Lt/Ut SpSV descriptors and their work buffers */
  if (!factors->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* the stored matrix is still L; only the operation is transposed */
                                              factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, &factors->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, &factors->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Lt, factors->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Ut, factors->spsvBufferSize_Ut));
    factors->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Factor values changed (or never analyzed): redo the numeric SpSV analysis for Lt and Ut */
  if (!factors->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, factors->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, factors->spsvBuffer_Ut));
    factors->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve U^T y = b, with y kept in the work vector Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, /* Ut Y = X */
                                       factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut));

  /* Solve L^T x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, /* Lt X = Y */
                                       factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: 2*nz - n flops */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1504 
/* Numeric ILU(0) factorization on the GPU. fact shares A's nonzero pattern (set up in the
   symbolic phase), so we copy A's values into fact's CSR value array and factor it in place
   with cusparseXcsrilu02(). The L/U SpSV descriptors were created over that same value array
   in the symbolic phase, so their numeric analysis is redone after the factorization. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* In debug mode, query cusparse for a numerical zero pivot encountered during the factorization */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Values now live on the GPU only; install the SpSV-based solve routines */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1555 
1556 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1557   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1558   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1559   PetscInt                      m, nz;
1560 
1561   PetscFunctionBegin;
1562   if (PetscDefined(USE_DEBUG)) {
1563     PetscInt  i;
1564     PetscBool flg, missing;
1565 
1566     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1567     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1568     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1569     PetscCall(MatMissingDiagonal(A, &missing, &i));
1570     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1571   }
1572 
1573   /* Free the old stale stuff */
1574   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1575 
1576   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1577      but they will not be used. Allocate them just for easy debugging.
1578    */
1579   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1580 
1581   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1582   fact->factortype             = MAT_FACTOR_ILU;
1583   fact->info.factor_mallocs    = 0;
1584   fact->info.fill_ratio_given  = info->fill;
1585   fact->info.fill_ratio_needed = 1.0;
1586 
1587   aij->row = NULL;
1588   aij->col = NULL;
1589 
1590   /* ====================================================================== */
1591   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1592   /* We'll do in-place factorization on fact                                */
1593   /* ====================================================================== */
1594   const int *Ai, *Aj;
1595 
1596   m  = fact->rmap->n;
1597   nz = aij->nz;
1598 
1599   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1600   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1601   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1602   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1603   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1604   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1605 
1606   /* ====================================================================== */
1607   /* Create descriptors for M, L, U                                         */
1608   /* ====================================================================== */
1609   cusparseFillMode_t fillMode;
1610   cusparseDiagType_t diagType;
1611 
1612   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1613   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1614   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1615 
1616   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1617     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1618     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1619     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1620     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1621   */
1622   fillMode = CUSPARSE_FILL_MODE_LOWER;
1623   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1624   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1625   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1626   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1627 
1628   fillMode = CUSPARSE_FILL_MODE_UPPER;
1629   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1630   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1631   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1632   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1633 
1634   /* ========================================================================= */
1635   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1636   /* ========================================================================= */
1637   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1638   if (m)
1639     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1640                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1641 
1642   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1643   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1644 
1645   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1646   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1647 
1648   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1649   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1650 
1651   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1652   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1653 
1654   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1655      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1656      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1657      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1658    */
1659   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1660     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1661     fs->spsvBuffer_L = fs->factBuffer_M;
1662     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1663   } else {
1664     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1665     fs->spsvBuffer_U = fs->factBuffer_M;
1666     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1667   }
1668 
1669   /* ========================================================================== */
1670   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1671   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1672   /* ========================================================================== */
1673   int              structural_zero;
1674   cusparseStatus_t status;
1675 
1676   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1677   if (m)
1678     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1679                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1680   if (PetscDefined(USE_DEBUG)) {
1681     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1682     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1683     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1684   }
1685 
1686   /* Estimate FLOPs of the numeric factorization */
1687   {
1688     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1689     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1690     PetscLogDouble flops = 0.0;
1691 
1692     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1693     Ai    = Aseq->i;
1694     Adiag = Aseq->diag;
1695     for (PetscInt i = 0; i < m; i++) {
1696       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1697         nzRow  = Ai[i + 1] - Ai[i];
1698         nzLeft = Adiag[i] - Ai[i];
1699         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1700           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1701         */
1702         nzLeft = (nzRow - 1) / 2;
1703         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1704       }
1705     }
1706     fs->numericFactFlops = flops;
1707   }
1708   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1709   PetscFunctionReturn(0);
1710 }
1711 
/* Triangular solve x = (L L^t)^{-1} b using the ICC(0) factor produced by
   MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(). Only the lower-triangular factor L
   is stored; the forward solve uses L and the backward solve applies L transposed,
   both through cusparseSpSV_solve() on the GPU.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *b_d; /* device array of b (read only) */
  PetscScalar                  *x_d; /* device array of x (write only) */

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b; y lives in the internal work vector fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve L^t x = y; descriptor X is repointed at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n)); /* two triangular solves over nz nonzeros */
  PetscFunctionReturn(0);
}
1741 
/* Numeric ICC(0) factorization on the GPU.

   The symbolic phase (MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0) already copied A's
   sparsity pattern into fact and allocated the cuSPARSE descriptors and buffers.
   Here we copy A's current values over, factorize in place with csric02(), and run
   the SpSV analysis needed by the L and L^t triangular solves in
   MatSolve_SeqAIJCUSPARSE_ICC0().

   Fix: use PetscCheck() for the numerical zero-pivot test, consistent with the
   structural zero-pivot checks in the ILU0/ICC0 symbolic routines (which use
   PetscCheck, not PetscAssert).
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsric02_zeroPivot() is blocking; it synchronizes with the device */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analysis for the forward solve L y = b */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* Values now live (only) on the device; wire up the solve callbacks */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1793 
/* Symbolic ICC(0) setup on the GPU.

   ICC(0) keeps exactly A's sparsity pattern, so no true symbolic factorization is
   needed: we copy A's row pointers and column indices into fact, allocate its value
   array on device, create the cuSPARSE descriptors and work buffers for csric02()
   and the SpSV triangular solves, and run the pattern-only analysis phases.
   The numeric phase is installed as MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0().
   The caller (MatICCFactorSymbolic_SeqAIJCUSPARSE) only routes here when perm is
   the identity and info->levels is 0.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    /* A must be MATSEQAIJCUSPARSE, square, and have a full diagonal */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free any stale factor data from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's metadata to fact. Note that fact's i,j,a are also allocated on the
     host, though they are not used here; they are allocated just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) incurs no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* Unlike ILU0, the Cholesky factor L keeps a real (non-unit) diagonal */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  /* guard m == 0: the csric02 routines error out on empty matrices */
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization, logged later by the numeric phase */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* Rough model: assume about half the nonzeros of the row lie left of the
           diagonal; eliminating each of them updates the rest of the row, costing
           one multiplication and one addition per updated entry.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
1934 #endif
1935 
/* Symbolic ILU dispatcher: ILU(0) with natural ordering (and device factorization
   enabled) goes to the GPU fast path; everything else falls back to the host
   SeqAIJ symbolic routine, with only the numeric phase redirected to CUSPARSE. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (fs->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    /* GPU fast path: ILU(0) with identity row/column permutations */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(0);
  }
#endif
  /* Host fallback */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1957 
/* Symbolic LU: the symbolic phase runs on the host via the SeqAIJ routine;
   only the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); /* drop any stale device-side factor data */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1967 
/* Symbolic ICC dispatcher: ICC(0) with natural ordering (and device factorization
   enabled) goes to the GPU fast path; everything else falls back to the host
   SeqAIJ symbolic routine, with only the numeric phase redirected to CUSPARSE. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (fs->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    /* GPU fast path: ICC(0) with identity permutation */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(0);
  }
#endif
  /* Host fallback */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1986 
/* Symbolic Cholesky: the symbolic phase runs on the host via the SeqAIJ routine;
   only the numeric phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); /* drop any stale device-side factor data */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1996 
/* Report the solver package ("cusparse") that produced this factor matrix. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2002 
2003 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2010 
2011   Level: beginner
2012 
2013 .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2014 M*/
2015 
/* MatGetFactor_seqaijcusparse_cusparse - create an (empty) MATSEQAIJCUSPARSE factor
   matrix B for A, to be filled later by the symbolic/numeric factorization phases.

   Supported factor types: LU/ILU/ILUDT and Cholesky/ICC; anything else errors.
   The option -mat_factor_bind_factorization {host|device} chooses where the
   factorization is performed when possible (default "device"). If A is bound to
   the CPU, the host SeqAIJ symbolic routines are installed instead of the
   CUSPARSE ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n; /* B is created square with A's row size */
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; /* set before MatSetType() below */
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Process -mat_factor_bind_factorization under the factor's (or A's) options prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A lives on the CPU: use the host SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A lives on the CPU: use the host SeqAIJ symbolic routines */
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* The factorization fills in the actual nonzero structure later */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2067 
/* Copy the matrix values (not the sparsity pattern) from the GPU back to the host
   array a->a when the GPU holds the only up-to-date copy. Handles unfactored
   matrices and, with new enough cuSPARSE, matrices factorized on device; other
   factored cases error out. On success the offload mask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* spptr holds a trifactor struct instead when A is a factor matrix */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) { /* otherwise the host copy is already current */
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2096 
/* Give read-write access to the host CSR value array, first syncing values from the GPU. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* ensure the host values are current */
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2103 
/* End read-write access: the host copy may have been modified, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2110 
/* Give read-only access to the host CSR value array, first syncing values from the GPU. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* ensure the host values are current */
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2117 
/* End read-only access: nothing to invalidate since the host copy was not modified. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
2123 
/* Give write-only access to the host CSR value array; no device-to-host sync is
   done since the caller is expected to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2129 
/* End write-only access: the host copy was rewritten, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2136 
/* Return device pointers to A's CSR arrays (row offsets, column indices, values)
   along with the memory type (PETSC_MEMTYPE_CUDA). A must be unfactored and is
   synced to the GPU first. Any of i, j, a, mtype may be NULL if not wanted.
   cuSPARSE stores 32-bit indices, so i/j are unavailable with 64-bit PetscInt.

   Fix: corrected the grammar of the 64-bit-index error message
   ("does not supported" -> "does not support").
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
2166 
/* Mirror the host AIJ data of A onto the GPU in the chosen cuSPARSE storage format.

   Two paths:
   - If the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
     only the numerical values are re-uploaded and the cached transpose invalidated.
   - Otherwise the whole GPU structure (descriptors, row offsets, column indices,
     values, compressed-row index list) is rebuilt from scratch.

   On success A->offloadmask becomes PETSC_OFFLOAD_BOTH, unless the host had no
   value array (structure-only upload), in which case the mask is left as-is. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when only the structure (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz); /* host-to-device upload of the values */
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose values are stale (structure kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* nonzero pattern changed: discard and rebuild all GPU-side structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload structure only and keep offload mask unchanged */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalars 1 and 0 used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary CSR, convert it to HYB, then free the CSR */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m); /* global row numbers of the stored rows */
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        /* tmp counts the cprowIndices entries transferred above */
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2313 
/* Thrust functor for zipped iterators: accumulate element 0 into element 1
   (i.e. y += x when applied over a zip of (x, y)) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2320 
/* Thrust functor for zipped iterators: copy element 0 into element 1 (y = x) */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2327 
/* Thrust functor for zipped iterators: copy element 1 into element 0 (x = y),
   the reverse direction of VecCUDAEquals */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2334 
/* Per-product auxiliary data attached to C->product->data for cuSPARSE
   sparse*dense and sparse*sparse products; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense; /* C was MATSEQDENSE (CPU) at symbolic time, so numeric converts it back */
  PetscScalar   *Bt;       /* device buffer holding explicit B^T (pre-CUDA-11 ABt/RARt path only) */
  Mat            X;        /* intermediate dense product used for PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count of the product, logged at numeric time */
  CsrMatrix     *Bcsr;     /* uncompressed copy of B when B uses compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* descriptor for Bcsr (when used) */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* extra work buffers required by the cusparseSpGEMMreuse API */
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2359 
/* Destructor for MatMatCusparse product data: frees device buffers, cuSPARSE
   descriptors and the intermediate dense matrix. All frees are either
   NULL-guarded or NULL-safe (cudaFree(NULL)/delete NULL are no-ops). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data) {
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2382 
2383 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2384 
/* Numeric phase of C = op(A)*op(B) where A is MATSEQAIJCUSPARSE and B is dense.
   Handles AB, AtB, ABt directly via cusparseSpMM (or csrmm pre-CUDA-11); for
   PtAP/RARt the sparse product is computed into the intermediate mmdata->X and
   the final dense product with B is done afterwards.
   Descriptors and the SpMM work buffer are cached in mmdata and rebuilt only
   when the leading dimensions of B or C change. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select which stored matrix (A or its explicit transpose), the cuSPARSE
     operation, and the dimensions of the sparse-times-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly formed transpose instead of op = transpose */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* sparse part of the product lands in the intermediate X, not in C */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow-only buffer: reallocate when the required size exceeds what is cached */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into mmdata->Bt with cublas geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    /* finish RARt: C = B * X (X = A*B^T) */
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    /* finish PtAP: C = B^T * X (X = A*B) */
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* restore the original (CPU) types the caller passed in, if we converted above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2540 
/* Symbolic phase for sparse(A) * dense(B) products: set the sizes and type of C,
   allocate the MatMatCusparse product data (including the B^T buffer on the
   pre-CUDA-11 path and the intermediate X for PtAP/RARt), and install the
   numeric routine. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the resulting dense matrix C for each supported product */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense; /* remembered so numeric can convert C back to CPU dense */
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2609 
/* Numeric phase for sparse * sparse products (AB, AtB, ABt) where A, B, C are all
   MATSEQAIJCUSPARSE. The symbolic phase already allocated C's pattern and the
   SpGEMM descriptors in mmdata; here only the values are (re)computed.
   If mmdata->reusesym is set, the values were already computed during the
   symbolic phase and we only finalize C's bookkeeping. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* map transpose products onto plain AB using symmetry, mirroring what the
     symbolic phase did (checked via the symbolic_used_the_fact_* flags) */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored transpose structs,
     since opA/opB must stay NON_TRANSPOSE for spgemm */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: reuse the structure computed at symbolic time, recompute values only */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: full compute followed by a copy into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* pre-CUDA-11: legacy csrgemm directly on the raw CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* result lives only on the device */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2728 
/* Symbolic phase of the sparse matrix-matrix product C = op(A)*op(B) with A, B, C
   all of type MATSEQAIJCUSPARSE and stored in CSR format.

   The sparsity pattern of C is computed on the GPU with cuSPARSE SpGEMM
   (cusparseSpGEMMreuse on CUDA >= 11.4, cusparseSpGEMM on other CUDA 11.x,
   cusparseXcsrgemm before CUDA 11) and then copied back to the host so that the
   Mat_SeqAIJ metadata of C (i, j, ilen, imax, rmax, nonzerorowcnt, ...) stays
   consistent with the device CSR structure.  Buffers/descriptors needed again by
   the numeric phase are stashed in the MatMatCusparse product data (mmdata).

   Supported product types: AB, AtB (via A's explicit transpose) and ABt (via B's
   explicit transpose), since cuSPARSE SpGEMM only accepts non-transposed operands. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* For symmetric operands A^T B (resp. A B^T) equals A B, so downgrade the
     product type and avoid forming an explicit transpose altogether */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* Select the mult structs to feed SpGEMM and the result dimensions (m x n,
     inner dimension k).  For AtB/ABt the explicitly formed transpose stands in
     for the transposed operand. */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's nonzero-row set: only rows where A has entries can be
       nonzero in the product */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  /* Generic descriptor plus device-resident scalar constants (pointer mode is
     DEVICE for the CUDA >= 11 paths below) */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* Empty product: build a valid all-zero CSR for C and skip SpGEMM entirely */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* Build a shallow CSR view of B with full (uncompressed) row offsets;
       column indices and values are shared with B's own CSR */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* Generic SpGEMM API: create C's descriptor with nnz = 0 / NULL arrays; the
     true pattern is attached later via cusparseCsrSetPointers */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
    void  *dBuffer1    = NULL;
    void  *dBuffer2    = NULL;
    void  *dBuffer3    = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz                = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* legacy (pre CUDA-11) csrgemm interface: nnz count on the host, then full product */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
/* Shared finalization: mirror the device CSR pattern into the host Mat_SeqAIJ
   so C behaves like a regular (preallocated, not yet assembled) AIJ matrix */
finalizesym:
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* Rows absent from the compressed set are empty: they repeat the previous
       offset so c->i becomes a valid full-length CSR row pointer */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and row statistics from the (now full) host row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt) !!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3100 
3101 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3102 
3103 /* handles sparse or dense B */
/* Select the symbolic product implementation for a product whose A is
   MATSEQAIJCUSPARSE.

   B may be sparse (SEQAIJCUSPARSE) or dense (SEQDENSE); for ABC products the
   extra operand C is checked as well.  When all operands are CUSPARSE and none
   is bound to the CPU, a per-product-type command-line option
   (-mat*_backend_cpu / -mat_product_algorithm_backend_cpu) lets the user force
   the CPU (SeqAIJ) backend.  Anything not handled here falls back to the plain
   SeqAIJ dispatch. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* a CPU-bound operand disables the GPU path regardless of type */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each product type exposes two spellings of the same switch: a legacy
       API-specific one (api_user) and the generic -mat_product_algorithm one */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default: break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
    /* PtAP/RARt/ABC have no dedicated CUSPARSE kernel; compose from pairwise products */
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
3210 
/* yy = A xx (no add, no transpose): thin wrapper over the shared mult kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  /* args: (A, x, y=NULL -> beta*y term absent, z, trans=FALSE, herm=FALSE) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3216 
/* zz = A xx + yy: thin wrapper over the shared mult kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3222 
/* yy = A^H xx: thin wrapper over the shared mult kernel (trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3228 
/* zz = A^H xx + yy: thin wrapper over the shared mult kernel (trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3234 
/* yy = A^T xx: thin wrapper over the shared mult kernel (trans=TRUE, herm=FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3240 
/* y[idx[i]] += x[i] for i in [0, n).

   Grid-stride loop so the kernel is correct for any launch configuration and
   for n exceeding the maximum grid coverage; indices are computed in PetscInt
   to avoid 32-bit overflow when PetscInt is 64-bit and n is large.
   NOTE(review): the addition is NOT atomic, so idx must not contain duplicate
   entries within a single launch — confirm at the call sites. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  const PetscInt stride = (PetscInt)blockDim.x * gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3245 
3246 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3247 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
3248   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3249   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3250   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3251   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3252   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3253   PetscBool                     compressed;
3254 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3255   PetscInt nx, ny;
3256 #endif
3257 
3258   PetscFunctionBegin;
3259   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3260   if (!a->nz) {
3261     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
3262     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3263     PetscFunctionReturn(0);
3264   }
3265   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3266   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3267   if (!trans) {
3268     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3269     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3270   } else {
3271     if (herm || !A->form_explicit_transpose) {
3272       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3273       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3274     } else {
3275       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3276       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3277     }
3278   }
3279   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3280   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3281 
3282   try {
3283     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3284     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3285     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3286 
3287     PetscCall(PetscLogGpuTimeBegin());
3288     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3289       /* z = A x + beta y.
3290          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3291          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3292       */
3293       xptr = xarray;
3294       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3295       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3296 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3297       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3298           allocated to accommodate different uses. So we get the length info directly from mat.
3299        */
3300       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3301         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3302         nx             = mat->num_cols;
3303         ny             = mat->num_rows;
3304       }
3305 #endif
3306     } else {
3307       /* z = A^T x + beta y
3308          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3309          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3310        */
3311       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3312       dptr = zarray;
3313       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3314       if (compressed) { /* Scatter x to work vector */
3315         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3316         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3317                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3318       }
3319 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3320       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3321         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3322         nx             = mat->num_rows;
3323         ny             = mat->num_cols;
3324       }
3325 #endif
3326     }
3327 
3328     /* csr_spmv does y = alpha op(A) x + beta y */
3329     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3330 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3331       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3332       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3333         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3334         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3335         PetscCallCUSPARSE(
3336           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3337         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3338 
3339         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3340       } else {
3341         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3342         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3343         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3344       }
3345 
3346       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3347                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3348 #else
3349       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3350       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3351 #endif
3352     } else {
3353       if (cusparsestruct->nrows) {
3354 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3355         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3356 #else
3357         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3358         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3359 #endif
3360       }
3361     }
3362     PetscCall(PetscLogGpuTimeEnd());
3363 
3364     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3365       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3366         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3367           PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
3368         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3369           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
3370         }
3371       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3372         PetscCall(VecSet_SeqCUDA(zz, 0));
3373       }
3374 
3375       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3376       if (compressed) {
3377         PetscCall(PetscLogGpuTimeBegin());
3378         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3379            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3380            prevent that. So I just add a ScatterAdd kernel.
3381          */
3382 #if 0
3383         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3384         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3385                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3386                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3387                          VecCUDAPlusEquals());
3388 #else
3389         PetscInt n = matstruct->cprowIndices->size();
3390         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3391 #endif
3392         PetscCall(PetscLogGpuTimeEnd());
3393       }
3394     } else {
3395       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3396     }
3397     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3398     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3399     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3400   } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
3401   if (yy) {
3402     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3403   } else {
3404     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3405   }
3406   PetscFunctionReturn(0);
3407 }
3408 
/* zz = A^T*xx + yy. Thin wrapper: delegates to the shared mult-add kernel with trans = PETSC_TRUE (transpose) and herm = PETSC_FALSE (not Hermitian) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3414 
/* Finish assembly on the host, then drop the cached device matrix if the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscObjectState    state_before   = A->nonzerostate;

  PetscFunctionBegin;
  /* Host-side assembly may bump A->nonzerostate when the sparsity pattern changes */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* A device matrix built for the old pattern is stale and must be rebuilt on demand */
  if (cusparsestruct->deviceMat && state_before != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3428 
3429 /* --------------------------------------------------------------------------------*/
3430 /*@
3431    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3433    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3434    assembly performance the user should preallocate the matrix storage by setting
3435    the parameter nz (or the array nnz).  By setting these parameters accurately,
3436    performance during matrix assembly can be increased by more than a factor of 50.
3437 
3438    Collective
3439 
3440    Input Parameters:
3441 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3442 .  m - number of rows
3443 .  n - number of columns
3444 .  nz - number of nonzeros per row (same for all rows)
3445 -  nnz - array containing the number of nonzeros in the various rows
3446          (possibly different for each row) or NULL
3447 
3448    Output Parameter:
3449 .  A - the matrix
3450 
3451    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3453    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3454 
3455    Notes:
3456    If nnz is given then nz is ignored
3457 
3458    The AIJ format, also called
3459    compressed row storage, is fully compatible with standard Fortran 77
3460    storage.  That is, the stored row and column indices can begin at
3461    either one (as in Fortran) or zero.  See the users' manual for details.
3462 
3463    Specify the preallocated storage with either nz or nnz (not both).
3464    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3465    allocation.  For large problems you MUST preallocate memory or you
3466    will get TERRIBLE performance, see the users' manual chapter on matrices.
3467 
3468    By default, this format uses inodes (identical nodes) when possible, to
3469    improve numerical efficiency of matrix-vector products and solves. We
3470    search for consecutive rows with the same nonzero structure, thereby
3471    reusing matrix information to achieve increased efficiency.
3472 
3473    Level: intermediate
3474 
3475 .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3476 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  /* Create an empty Mat, give it the sequential CUSPARSE type, then preallocate storage */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); /* cast away const to match the preallocation routine's signature */
  PetscFunctionReturn(0);
}
3485 
/* Destroy an AIJCUSPARSE matrix: free the GPU-side data, detach all composed methods, then run the base SeqAIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  /* All method names composed on this object by MatConvert/MatBindToCPU; each is detached below */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* The spptr payload differs depending on whether A holds a factorization */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (int i = 0; i < (int)(sizeof(composed) / sizeof(composed[0])); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3506 
3507 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3508 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by copying on the host first, then converting the copy in place to the CUSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
3515 
/* Y += a*X for two AIJCUSPARSE matrices, performed on the GPU when possible.
   Strategy:
   - if X and Y do not share the same axpy implementation (one is bound to the CPU), fall back to the host routine;
   - if the sparsity patterns turn out to be identical, reduce to a single cuBLAS axpy over the value arrays;
   - if X's pattern is a subset of Y's, use cuSPARSE csr spgeam;
   - otherwise fall back to the host routine (pattern merge needed). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* Host fallback; Y's values will change, so its cached transpose becomes invalid */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and (verified on device) identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cuSPARSE spgeam, writing the result back into Y's CSR arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* alpha/beta (&a, &b) are host pointers here, so switch pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit workspace query + allocation for spgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* Restore the handle's default pointer mode */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* Identical patterns: ay += a*ax over the nz value arrays with a single cuBLAS axpy */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* Different patterns: the result needs a new sparsity pattern, handled on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
3600 
/* Y *= a. Since the nonzero pattern is unchanged, this is a single cuBLAS scal over Y's nz values on the GPU */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* guard against PetscInt -> BLAS int overflow */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, stride));
  PetscCall(PetscLogGpuFlops(n)); /* one multiply per stored value */
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3619 
/* Zero all stored values of A on both host and device (pattern is kept).
   The device copies (A and its cached transpose) are only zeroed for non-factored matrices. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  Mat_SeqAIJ *aij        = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroed_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
      if (csr->values) {
        zeroed_gpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csr = (CsrMatrix *)spptr->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
    }
  }
  /* The host copy is always zeroed */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* Host and device agree only when the device values were actually zeroed above */
  A->offloadmask = zeroed_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3645 
/* Switch A's operation tables between the host (SeqAIJ) and device (SeqAIJCUSPARSE) implementations.
   flg = PETSC_TRUE binds to the CPU; flg = PETSC_FALSE (re)binds to the GPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* Factored matrices keep their solve ops; only record the binding */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* Bind to CPU: make sure the host copy of the values is current first */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* Clear the SeqAIJ sub-ops table (array access hooks etc. set in the GPU branch below) */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* Detach the GPU-only composed methods */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* Bind to GPU: install the CUSPARSE implementations and array-access hooks */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations are only used by the host implementation */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3708 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE. For MAT_INITIAL_MATRIX a new matrix is created;
   for MAT_REUSE_MATRIX values are copied into *newmat; for MAT_INPLACE_MATRIX A itself is retyped.
   Allocates the cuSPARSE handle/spptr (when absent), installs the CUSPARSE op tables via
   MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE), and composes the CUSPARSE-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, *newmat is A itself */

  /* Vectors created from B should default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* Regular matrix: create the SpMV/SpMM bookkeeping struct and a cuSPARSE handle on PETSc's stream */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* Factor matrix: create the triangular-factors struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* Install the GPU op tables and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3767 
/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
3774 
3775 /*MC
3776    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3777 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3779    CSR, ELL, or Hybrid format.
3780    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3781 
3782    Options Database Keys:
3783 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3784 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3787 
3788   Level: beginner
3789 
3790 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3791 M*/
3792 
3793 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3794 
/* Register the cuSPARSE-based factorization back-ends with the solver-type registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* The banded LU solver operates on plain SEQAIJ matrices */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  /* The generic cuSPARSE solver provides LU, Cholesky, ILU and ICC for SEQAIJCUSPARSE matrices */
  for (int i = 0; i < (int)(sizeof(factors) / sizeof(factors[0])); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(0);
}
3805 
/* Release all COO-preallocation state held on the GPU so the matrix can be re-preallocated.
   Safe to call when no COO state exists. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  /* delete on NULL is a no-op, so no guards are needed for the thrust vectors */
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the pointers after freeing: MatSeqAIJCUSPARSE_Destroy() frees them again when
       non-NULL, so leaving stale values here would cause a double free at destroy time */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3822 
/* Free the entire GPU-side structure of a non-factored AIJCUSPARSE matrix:
   both mult structs (matrix and cached transpose), thrust work vectors, COO state,
   and the cuSPARSE handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) {
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    /* heap-allocated thrust containers; delete on NULL is a no-op */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device arrays (allocated with cudaMalloc) */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3840 
/* Free a CsrMatrix (its three thrust device vectors plus the struct itself) and NULL the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) {
  PetscFunctionBegin;
  /* Nothing to do for an absent matrix */
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3852 
/* Free a triangular-factor struct: cuSPARSE descriptors, solve info, CSR storage and scratch buffers.
   NOTE(review): despite the MultStruct name, this overload operates on a Mat_SeqAIJCUSPARSETriFactorStruct;
   it is distinguished from the MultStruct overload below only by its parameter type. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) {
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* host-pinned buffer */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3868 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the device matrix (interpreted according to the given
   storage format), its cusparse descriptors, the compressed-row index array, and the
   device-resident scalar constants.  No-op when *matstruct is NULL; resets *matstruct to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) {
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* bug fix: the returned error code was previously dropped (no PetscCall) */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalars used as alpha/beta arguments to cusparse calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* cached SpMV buffers and dense-vector descriptors, freed only if they were set up */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3909 
/* Free everything inside a Mat_SeqAIJCUSPARSETriFactors struct without freeing the struct
   itself (or its cusparse handle).  Freed members are reset to NULL so that calling Reset
   again (or Reset followed by factorization teardown) does not free them a second time —
   previously only rpermIndices/cpermIndices/workVector were reset, so a repeated Reset
   would cudaFree stale device pointers and destroy stale cusparse descriptors. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) {
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->a_band_d      = NULL; /* prevent a double cudaFree() on a second Reset */
    fs->i_band_d      = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* cudaFree(NULL) is a documented no-op, so these calls are safe once the members are NULLed below */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->csrRowPtr     = NULL;
    fs->csrColIdx     = NULL;
    fs->csrVal        = NULL;
    fs->X             = NULL;
    fs->Y             = NULL;
    fs->spsvBuffer_L  = NULL;
    fs->spsvBuffer_U  = NULL;
    fs->spsvBuffer_Lt = NULL;
    fs->spsvBuffer_Ut = NULL;
    /* guard the destroys so an already-reset (NULL) descriptor is not destroyed twice */
    if (fs->matDescr_M) PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    if (fs->spsvDescr_L) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    if (fs->spsvDescr_Lt) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    if (fs->spsvDescr_U) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    if (fs->spsvDescr_Ut) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    if (fs->ilu0Info_M) PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    if (fs->ic0Info_M) PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->matDescr_M   = NULL;
    fs->spMatDescr_L = NULL;
    fs->spMatDescr_U = NULL;
    fs->spsvDescr_L  = NULL;
    fs->spsvDescr_Lt = NULL;
    fs->spsvDescr_U  = NULL;
    fs->spsvDescr_Ut = NULL;
    fs->dnVecDescr_X = NULL;
    fs->dnVecDescr_Y = NULL;
    fs->ilu0Info_M   = NULL;
    fs->ic0Info_M    = NULL;

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
3957 
/* Reset the triangular-factor struct, destroy its cusparse handle (if any), and free it. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle = (*trifactors)->handle; /* Reset() does not touch the handle */

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3969 
/* Strict weak ordering on (i,j) index pairs: lexicographic, row first then column. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3977 
/* Equality of (i,j) index pairs: both components must match. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>(); }
};
3984 
/* Binary functor: 0 when the two indices are equal, 1 otherwise (used with adjacent_difference). */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
3988 
/* Binary functor: logical OR of two flags, returned as 0/1. */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != 0 || t2 != 0) ? 1 : 0; }
};
3992 
3993 #include <thrust/iterator/discard_iterator.h>
3994 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert or add the COO values v[] (host or device pointer) into the device CSR values of A,
   using the sorting permutation cusp->cooPerm and the duplicate-compression map cusp->cooPerm_a
   built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(). */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* temporary device copy of v[] when v[] is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO metadata available: nothing to scatter, just finish assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* v == NULL: INSERT_VALUES zeroes the matrix; ADD_VALUES leaves it unchanged */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* copy host values to the device once; all scattering happens device-side */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to the existing matrix values */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[]; sum the duplicates first, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); /* values[i] += cooPerm_w[i] */
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device now holds the authoritative values */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
4070 
/* Mark the cached device transpose of A as out of date; when destroy is true, also free the
   cached transpose structure and the csr2csc index array. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4085 
4086 #include <thrust/binary_search.h>
4087 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A from n COO entries (coo_i[], coo_j[]) — host or device
   pointers — and cache on the GPU the sorting permutation (cusp->cooPerm) plus, when the COO
   list contains duplicates, the sorted-entry -> unique-nonzero map (cusp->cooPerm_a), both
   consumed later by MatSetValuesCOO_SeqAIJCUSPARSE_Basic(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* COO size changed since the last preallocation: drop the cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* obtain device views of the row indices, copying from host if necessary */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row end offsets, computed by the upper_bound below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays (a->i, a->j) of the SeqAIJ part from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4221 
/* COO preallocation dispatcher: use the 'Basic' path when the indices are on the device or
   contain no negative entries; otherwise use the extended SeqAIJ path and mirror its
   jmap/perm arrays onto the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  PetscBool    use_basic = PETSC_TRUE;
  PetscMemType mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* only host-side indices can be scanned for negative entries */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          use_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (use_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* copy the host-built jmap/perm maps to the device for MatSetValuesCOO_SeqAIJCUSPARSE() */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4259 
/* Kernel: for each of the nnz unique nonzeros of A, sum the COO input values mapped to it via
   jmap[]/perm[], then overwrite (INSERT_VALUES) or accumulate into a[].  Grid-stride loop, so
   any 1-D launch configuration covers all entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    if (imode == INSERT_VALUES) a[i] = sum;
    else a[i] += sum;
  }
}
4269 
/* COO value-insertion driver: uses the extended COO path (device jmap_d/perm_d built by
   MatSetPreallocationCOO_SeqAIJCUSPARSE()) when enabled, otherwise the 'Basic' path. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v; /* device-accessible view of v[] */
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    /* NOTE(review): unlike MatSetValuesCOO_SeqAIJCUSPARSE_Basic(), this path does not
       special-case v == NULL; confirm callers always pass a non-NULL v[] here */
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so a write-only (no host->device copy) access suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch kernel launch errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}
4303 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices

    Level: developer

    Note:
      When compressed is true, the CSR structure does not contain empty rows

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR is row-compressed; build (and cache) the uncompressed row offsets */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4350 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices

    Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only invalidates the caller's pointers; no device work is required */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4376 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure device values are current before exposing them */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4411 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: nothing to flush or invalidate, just drop the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4435 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* bring current values to the device before handing out write access */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: the device copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified: drop cached diagonal info and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4497 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   Does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, existing values are irrelevant */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4533 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written: drop cached diagonal info and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4559 
/* Strict weak ordering on (row, col, value, tag) tuples: lexicographic by row,
   then column; the value and tag components do not participate in the comparison. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
4567 
/* Unary functor adding a fixed offset to an integer value; used to relocate
   column indices / row offsets when concatenating two CSR structures. */
struct Shift {
  int _shift; /* constant added to every input value */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4574 
/* Merges two SeqAIJCUSPARSE matrices A, B by concatenating their columns, C = [A B]
   (the [A';B']' operation in MATLAB notation), entirely on the GPU.

   With MAT_INITIAL_MATRIX the CSR structure of C is built by converting A and B to COO,
   merging the tagged entry streams by (row, column) and converting back to CSR; the
   destinations produced by the merge are stored in Ccusp->cooPerm so that a subsequent
   MAT_REUSE_MATRIX call only needs to scatter the new values of A and B into C.
   MAT_INPLACE_MATRIX is not supported; A and B must have the same number of rows and
   use the CSR storage format. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C with the concatenated sizes and assemble its cusparse structures by hand */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    /* make sure both operands have up-to-date device copies before reading their CSR data */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* scratch COO row-index buffers for A, B and the merged result */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the (full) CSR row offsets of A and B into COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag each entry with its origin: 1 for A, 0 for B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* shift B's column indices on the fly so they land in C's column space */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the zipped (row, col, val, tag) streams of A and B, ordered by (row, col) */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* recover the destinations from the merged tags in wPerm:
         cooPerm[0..Annz) = positions of A's entries in C, cooPerm[Annz..) = positions of B's */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C^T is [A^T; B^T] stacked by rows: concatenate the row offsets (B^T's shifted
           by A's nnz and overwriting the duplicated boundary entry), column indices and values */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure on the host so the usual SeqAIJ queries keep working */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* recompute per-row lengths and counters from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      /* structure is unchanged: scatter the (new) values of A and B into C using the
         permutation stored at creation (first Acsr->num_entries slots belong to A) */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        /* also refresh the values of the stored transpose, laid out as [A^T; B^T] */
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4855 
/*
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies the matrix values at positions
   idx[0..n) of the aij value array into v, performing the gather on the GPU.
   v may point to host or device memory; when idx is NULL the first n values
   are copied contiguously.

   Fix: the final transfer logging used PetscLogCpuToGpu although the value
   data moves device -> host (cudaMemcpyDeviceToHost on both paths) when v is
   host memory; it now logs PetscLogGpuToCpu.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does the destination live in device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gathered values in a device buffer, then copy them to the host below */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[idx[k]] -> dv[k] on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* values moved device -> host */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}
4890