xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Option-string table for MatCUSPARSEStorageFormat: enum value names in 0-based order,
   followed by the enum type name, the option prefix, and a NULL sentinel (the layout
   expected by PetscOptionsEnum()) */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
24 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
50       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52   } cusparseCsr2CscAlg_t;
53   */
/* String tables mirroring the cuSPARSE enums reproduced in the comment above, listed in
   0-based integer-value order so PetscOptionsEnum() maps a parsed position directly to the
   cuSPARSE enum value; each table ends with the enum type name, option prefix, and NULL sentinel */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
57 #endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
94 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices.
   Sequential matrices keep a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the same field; any other operation is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) cusp->format = format;
  else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  PetscFunctionReturn(0);
}
106 
107 /*@
108    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
109    operation. Only the MatMult operation can use different GPU storage formats
110    for MPIAIJCUSPARSE matrices.
111    Not Collective
112 
113    Input Parameters:
114 +  A - Matrix of type SEQAIJCUSPARSE
115 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
116 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
117 
118    Output Parameter:
119 
120    Level: intermediate
121 
122 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
123 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation registered under "MatCUSPARSESetFormat_C" (if any),
     e.g. MatCUSPARSESetFormat_SeqAIJCUSPARSE() for SEQAIJCUSPARSE matrices */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
130 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve() for SEQAIJCUSPARSE matrices */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* record the preference; MatLUFactorNumeric_SeqAIJCUSPARSE() consults this flag to decide
     whether the triangular solves are left on the CPU or staged onto the GPU */
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
138 
139 /*@
   MatCUSPARSESetUseCPUSolve - Sets whether to use the built-in CPU MatSolve.
141 
142    Input Parameters:
143 +  A - Matrix of type SEQAIJCUSPARSE
144 -  use_cpu - set flag for using the built-in CPU MatSolve
145 
146    Output Parameter:
147 
   Notes:
   The cuSPARSE LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
152 
153    Level: intermediate
154 
155 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
156 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation registered under "MatCUSPARSESetUseCPUSolve_C" (if any),
     e.g. MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE() for SEQAIJCUSPARSE matrices */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
163 
/* MatSetOption implementation for SEQAIJCUSPARSE: handle MAT_FORM_EXPLICIT_TRANSPOSE
   locally and forward every other option to the base SeqAIJ handler. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when turning the option off, drop any cached transpose so a stale copy cannot
       cause logic errors if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
176 
177 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
178 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs on the host
   via the SeqAIJ kernel; afterwards the appropriate MatSolve variants are selected and,
   unless a CPU solve was requested, the triangular factors are staged onto the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ         *fact = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscBool           rowIdentity, colIdentity;

  PetscFunctionBegin;
  /* the CPU factorization needs A's current values on the host */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural (identity) orderings admit a faster MatSolve variant */
  PetscCall(ISIdentity(fact->row, &rowIdentity));
  PetscCall(ISIdentity(fact->col, &colIdentity));
  if (!cusp->use_cpu_solve) {
    if (rowIdentity && colIdentity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors for the device solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
209 
/* Process the -mat_cusparse_* command-line options for a SEQAIJCUSPARSE matrix:
   storage format, CPU-vs-GPU solve preference, and (CUDA >= 11) the cuSPARSE
   SpMV/SpMM/csr2csc algorithm choices. Options are only read for unfactored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format used for MatMult only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
244 
/* Build or refresh, on the GPU, the unit-diagonal lower-triangular factor L extracted from
   the host (I)LU factorization stored in A. The first call allocates the CSR structure,
   copies it to the device via Thrust, and runs the cuSPARSE triangular-solve analysis;
   subsequent calls only re-upload the numerical values. Uses pinned (cudaMallocHost)
   staging buffers for the host-side assembly. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date values live on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only the implicit unit diagonal */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz      = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal entry */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* Thrust assign() performs the host-to-device copies */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* newer cuSPARSE csrsv requires an explicit, user-allocated work buffer */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AALo is kept (as AA_h) for value-only updates on later calls */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix (same layout as the structural pass above) */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
377 
/* Build or refresh, on the GPU, the upper-triangular factor U extracted from the host
   (I)LU factorization stored in A. The diagonal entry of each row is stored as its
   reciprocal (cuSPARSE descriptor uses DIAG_TYPE_NON_UNIT). The first call allocates
   the CSR structure and runs the cuSPARSE solve analysis; subsequent calls only
   re-upload the values. Rows are walked backwards because a->diag indexes the upper
   triangle from the end (adiag[i] > adiag[i+1]). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date values live on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* row i's strictly-upper entries start just past adiag[i + 1] */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements (stored as the reciprocal of the pivot) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* Thrust assign() performs the host-to-device copies */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* newer cuSPARSE csrsv requires an explicit, user-allocated work buffer */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AAUp is kept (as AA_h) for value-only updates on later calls */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        /* structure already on the GPU: refresh the numerical values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix (same layout as the structural pass above) */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
509 
/* After an (I)LU factorization on the host, upload both triangular factors to the GPU
   and cache on the device the row/column permutations required by the generic
   (non-natural-ordering) triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fact = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fact, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* device scratch vector shared by the two triangular solves */
  if (!fact->workVector) fact->workVector = new THRUSTARRAY(n);
  fact->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device, unless it is the identity */
  PetscCall(ISIdentity(aij->row, &rowIdentity));
  if (!rowIdentity && !fact->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(aij->row, &idx));
    fact->rpermIndices = new THRUSTINTARRAY(n);
    fact->rpermIndices->assign(idx, idx + n);
    PetscCall(ISRestoreIndices(aij->row, &idx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* likewise for the column permutation (icol) */
  PetscCall(ISIdentity(aij->icol, &colIdentity));
  if (!colIdentity && !fact->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(aij->icol, &idx));
    fact->cpermIndices = new THRUSTINTARRAY(n);
    fact->cpermIndices->assign(idx, idx + n);
    PetscCall(ISRestoreIndices(aij->icol, &idx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
551 
/*
  Build (or refresh) the GPU triangular factors used by the ICC(0) solves.

  The host-side ICC factor of A stores only the upper triangle. From it we build:
    - upTriFactor: U with unit diagonal (diagonal stored as 1/d_i), solved NON_TRANSPOSE;
    - loTriFactor: the lower factor, stored with the SAME upper-triangular sparsity
      pattern but solved with CUSPARSE_OPERATION_TRANSPOSE, values scaled by 1/d_i.

  First call (no factor structs yet): allocates pattern + values, uploads both, and runs
  the cusparse triangular-solve analysis. Subsequent calls: recompute only the values on
  the host and re-upload them into the existing device arrays.

  NOTE(review): A->data is cast to both Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ (for i/j/a);
  presumably the ICC factor is held in SBAIJ layout inside an AIJ shell — confirm against
  the factorization setup code elsewhere in this file.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* pinned host staging: row offsets / column indices of U */
  PetscScalar                       *AAUp;        /* pinned host staging: values of U */
  PetscScalar                       *AALo;        /* pinned host staging: values of the scaled lower factor (same pattern as U) */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned allocations so the thrust assign() uploads below run at full bandwidth */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: stored inverted (1/d_i) so the UNIT-diag
             triangular solve needs no division; v[nz] is the diagonal entry of row i */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* strictly-upper entries: copied then negated for U; additionally scaled by
               1/d_i for the lower factor (which reuses U's pattern, solved transposed) */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* the lower factor is stored in the UPPER pattern and solved with OPERATION_TRANSPOSE below */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: pattern (AiUp/AjUp) shared with the upper factor, values are AALo */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        /* AiUp/AjUp are only allocated on this first-build path */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* factors already exist on the GPU: the sparsity pattern is unchanged, so only
           recompute the values (same scaling scheme as the first-build path) and re-upload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
753 
/* Finish setting up the ICC factorization on the GPU: build/update the triangular
   factors, allocate the solve work vector, record the factored nonzero count, and
   (only when the ordering is not the identity) upload the row/column permutation
   index arrays used by the permuted MatSolve path. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscBool                     identity;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = (aij->nz - nrows) * 2 + nrows; /* off-diagonal entries counted for both triangles, diagonal once */

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* with the natural ordering no permutation vectors are needed on the device */
  PetscCall(ISIdentity(rowperm, &identity));
  if (identity) PetscFunctionReturn(0);

  {
    IS              invperm;
    const PetscInt *inv, *fwd;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &inv));
    PetscCall(ISGetIndices(rowperm, &fwd));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(fwd, fwd + nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv, inv + nrows);
    PetscCall(ISRestoreIndices(invperm, &inv));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm, &fwd));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
789 
/* Numeric Cholesky factorization for SeqAIJCUSPARSE: runs the CPU factorization,
   selects the GPU solve routines according to whether the ordering is the identity,
   and then builds/analyzes the triangular factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *bseq    = (Mat_SeqAIJ *)B->data;
  IS          rowperm = bseq->row;
  PetscBool   identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the NaturalOrdering
     variants skip the permutation work on the GPU */
  PetscCall(ISIdentity(rowperm, &identity));
  B->ops->solve             = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
817 
/*
  MatSeqAIJCUSPARSETransposeTriFactor - build the explicit transpose (CSC form) of one
  cusparse triangular factor and run the cusparse triangular-solve analysis on it.

  Input:
    A                  - the factored matrix (used only for event logging)
    cusparseTriFactors - supplies the cusparse handle
    triFactor          - the already-built factor to transpose
  Output:
    triFactorTp        - newly allocated transposed factor, analyzed and ready for solves

  The transpose keeps the input factor's matrix type and diag type, flips the fill mode,
  and uses CUSPARSE_OPERATION_NON_TRANSPOSE (the transpose is stored explicitly). The
  csr2csc scratch buffer is attached to the INPUT factor (triFactor->csr2cscBuffer),
  preserving the ownership of the previous inline implementation.
*/
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactor(Mat A, Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors, Mat_SeqAIJCUSPARSETriFactorStruct *triFactor, Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTp) {
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  PetscCall(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the matrix descriptors of the input factor, flipping the fill mode since we store the transpose */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* set the operation: the transpose is explicit, so solves do not transpose again */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat                 = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows + 1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows, triFactor->csrMat->num_cols, triFactor->csrMat->num_entries, triFactor->csrMat->values->data().get(),
                                                  triFactor->csrMat->row_offsets->data().get(), triFactor->csrMat->column_indices->data().get(), triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                                  triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactor->csr2cscBuffer, triFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows, triFactor->csrMat->num_cols, triFactor->csrMat->num_entries, triFactor->csrMat->values->data().get(), triFactor->csrMat->row_offsets->data().get(),
                                     triFactor->csrMat->column_indices->data().get(), triFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer));
#else
                                     triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this used to call PetscLogEventBegin a second time, leaving the event unbalanced */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, triFactorT->solveOp, triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr, triFactorT->csrMat->values->data().get(),
                                            triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo, &triFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactorT->solveBuffer, triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, triFactorT->solveOp, triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr, triFactorT->csrMat->values->data().get(),
                                            triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            triFactorT->solveInfo, triFactorT->solvePolicy, triFactorT->solveBuffer));
#else
                                            triFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  *triFactorTp = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - form explicit transposes of both
  triangular factors of A and run the cusparse solve analysis on them, so that
  MatSolveTranspose can use forward/backward substitution with the transposed factors.

  The previously duplicated lower/upper code paths (identical modulo lo/up names)
  are now shared via MatSeqAIJCUSPARSETransposeTriFactor().
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT, *upTriFactorT;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  PetscCall(MatSeqAIJCUSPARSETransposeTriFactor(A, cusparseTriFactors, (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr, &loTriFactorT));
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  PetscCall(MatSeqAIJCUSPARSETransposeTriFactor(A, cusparseTriFactors, (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr, &upTriFactorT));
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
980 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real part;
   usable from both host and device code (e.g. in thrust algorithms). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return static_cast<PetscInt>(PetscRealPart(s)); }
};
984 
985 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
986   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
987   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
988   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
989   cusparseStatus_t              stat;
990   cusparseIndexBase_t           indexBase;
991 
992   PetscFunctionBegin;
993   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
994   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
995   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
996   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
997   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
998   if (A->transupdated) PetscFunctionReturn(0);
999   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1000   PetscCall(PetscLogGpuTimeBegin());
1001   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1002   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1003     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1004     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1005     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1006     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1007     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1008 
1009     /* set alpha and beta */
1010     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1011     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1012     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1013     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1014     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1015     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1016 
1017     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1018       CsrMatrix *matrixT      = new CsrMatrix;
1019       matstructT->mat         = matrixT;
1020       matrixT->num_rows       = A->cmap->n;
1021       matrixT->num_cols       = A->rmap->n;
1022       matrixT->num_entries    = a->nz;
1023       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1024       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1025       matrixT->values         = new THRUSTARRAY(a->nz);
1026 
1027       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1028       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1029 
1030 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1031 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1032       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1033                                indexBase, cusparse_scalartype);
1034       PetscCallCUSPARSE(stat);
1035 #else
1036       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1037            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1038 
1039            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1040            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1041            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1042         */
1043       if (matrixT->num_entries) {
1044         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1045         PetscCallCUSPARSE(stat);
1046 
1047       } else {
1048         matstructT->matDescr = NULL;
1049         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1050       }
1051 #endif
1052 #endif
1053     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1054 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1055       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1056 #else
1057       CsrMatrix *temp = new CsrMatrix;
1058       CsrMatrix *tempT = new CsrMatrix;
1059       /* First convert HYB to CSR */
1060       temp->num_rows = A->rmap->n;
1061       temp->num_cols = A->cmap->n;
1062       temp->num_entries = a->nz;
1063       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1064       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1065       temp->values = new THRUSTARRAY(a->nz);
1066 
1067       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1068       PetscCallCUSPARSE(stat);
1069 
1070       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1071       tempT->num_rows = A->rmap->n;
1072       tempT->num_cols = A->cmap->n;
1073       tempT->num_entries = a->nz;
1074       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1075       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1076       tempT->values = new THRUSTARRAY(a->nz);
1077 
1078       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1079                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1080       PetscCallCUSPARSE(stat);
1081 
1082       /* Last, convert CSC to HYB */
1083       cusparseHybMat_t hybMat;
1084       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1085       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1086       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1087       PetscCallCUSPARSE(stat);
1088 
1089       /* assign the pointer */
1090       matstructT->mat = hybMat;
1091       A->transupdated = PETSC_TRUE;
1092       /* delete temporaries */
1093       if (tempT) {
1094         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1095         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1096         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1097         delete (CsrMatrix *)tempT;
1098       }
1099       if (temp) {
1100         if (temp->values) delete (THRUSTARRAY *)temp->values;
1101         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1102         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1103         delete (CsrMatrix *)temp;
1104       }
1105 #endif
1106     }
1107   }
1108   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1109     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1110     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1111     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1112     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1113     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1114     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1115     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1116     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1117     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1118     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1119     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1120       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1121       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1122       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1123     }
1124     if (!cusparsestruct->csr2csc_i) {
1125       THRUSTARRAY csr2csc_a(matrix->num_entries);
1126       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1127 
1128       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1129 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1130       void  *csr2cscBuffer;
1131       size_t csr2cscBufferSize;
1132       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1133                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1134       PetscCallCUSPARSE(stat);
1135       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1136 #endif
1137 
1138       if (matrix->num_entries) {
1139         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1140            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameter and they were all fine. I have no clue why cusparse complains.
1142 
1143            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1144            should be filled with indexBase. So I just take a shortcut here.
1145         */
1146         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1147 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1148                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1149         PetscCallCUSPARSE(stat);
1150 #else
1151                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1152         PetscCallCUSPARSE(stat);
1153 #endif
1154       } else {
1155         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1156       }
1157 
1158       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1159       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1160 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1161       PetscCallCUDA(cudaFree(csr2cscBuffer));
1162 #endif
1163     }
1164     PetscCallThrust(
1165       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1166   }
1167   PetscCall(PetscLogGpuTimeEnd());
1168   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1169   /* the compressed row indices is not used for matTranspose */
1170   matstructT->cprowIndices                       = NULL;
1171   /* assign the pointer */
1172   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1173   A->transupdated                                = PETSC_TRUE;
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the cached
  triangular factors of A together with the row/column permutations of the
  factorization ordering.

  Steps (all enqueued on PetscDefaultCudaStream):
    1. xGPU    <- b reordered by the row permutation (rpermIndices)
    2. tempGPU <- triangular solve with the transposed upper factor (rhs xarray)
    3. xarray  <- triangular solve with the transposed lower factor (rhs tempGPU)
    4. xGPU    <- reordered by the column permutation via tempGPU (cannot be done in place)

  The transposed-factor structures are created lazily on the first transpose solve.
  Logs 2*nnz - n flops for the two triangular solves.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly.
     NOTE(review): the analyze call is assumed to create both transposed factors together;
     if only one of them were NULL this condition would skip the analysis -- verify that invariant. */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the (transposed) upper factor: rhs xarray, solution tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the (transposed) lower factor: rhs tempGPU, solution xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1250 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the GPU when
  the factorization used the natural ordering, so no row/column permutations are applied:
  a triangular solve with the transposed upper factor (rhs barray, result tempGPU)
  followed by one with the transposed lower factor (rhs tempGPU, result xarray).

  The transposed-factor structures are created lazily on the first transpose solve.
  Logs 2*nnz - n flops for the two triangular solves.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly.
     NOTE(review): assumes both transposed factors are created together by the analyze call. */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the (transposed) upper factor: rhs barray, solution tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the (transposed) lower factor: rhs tempGPU, solution xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1309 
/*
  MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU using the cached triangular
  factors and the row/column permutations of the factorization ordering.

  Steps (all enqueued on PetscDefaultCudaStream):
    1. tempGPU <- b reordered by the row permutation (rpermIndices)
    2. xarray  <- lower-triangular solve (rhs tempGPU)
    3. tempGPU <- upper-triangular solve (rhs xarray)
    4. xGPU    <- tempGPU reordered by the column permutation (cpermIndices)

  Logs 2*nnz - n flops for the two triangular solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: temp[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: rhs tempGPU, solution xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U: rhs xarray, solution tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation: x[i] = temp[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1371 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b on the GPU when the
  factorization used the natural ordering (no row/column permutations): a
  lower-triangular solve (rhs barray, result tempGPU) followed by an
  upper-triangular solve (rhs tempGPU, result xarray).
  Logs 2*nnz - n flops for the two triangular solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: rhs barray, solution tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U: rhs tempGPU, solution xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1422 
1423 #if CUSPARSE_VERSION >= 11500
1424 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/*
  MatSolve_SeqAIJCUSPARSE_ILU0 - triangular solves with the in-place ILU(0) factors
  via the cusparse generic SpSV API: L y = b first, then U x = y, with the
  intermediate vector y held in tf->Y.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xdata;
  const PetscScalar            *bdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Lower solve: L y = b, with y stored in the work vector tf->Y.
     Note: cusparseSpSV_solve() secretly reuses the external buffer that was handed to cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_Y, tf->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_L));

  /* Upper solve: U x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_U, tf->dnVecDescr_Y, tf->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1455 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_ILU0 - solves fact^T x = b with the ILU(0) factors
  using cusparse SpSV transpose solves: U^T y = b first, then L^T x = y (y kept in tf->Y).

  The transpose SpSV descriptors and work buffers are created on the first call; the
  (numeric) SpSV analysis is redone whenever the factor values changed since the last call.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xdata;
  const PetscScalar            *bdata;

  PetscFunctionBegin;
  /* First transpose solve with this factorization: create the descriptors and buffers.
     The sparse matrices are still L and U; only the operation is CUSPARSE_OPERATION_TRANSPOSE. */
  if (!tf->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&tf->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Lt, &tf->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&tf->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_U, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Ut, &tf->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&tf->spsvBuffer_Lt, tf->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&tf->spsvBuffer_Ut, tf->spsvBufferSize_Ut));
    tf->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the numeric transpose analysis if the factor values are new */
  if (!tf->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Lt, tf->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_U, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Ut, tf->spsvBuffer_Ut));
    tf->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Upper transpose solve: U^T y = b, with y in the work vector tf->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_Y, tf->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_U, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Ut));

  /* Lower transpose solve: L^T x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, tf->dnVecDescr_Y, tf->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1503 
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric ILU(0) factorization on the GPU.

  Copies A's values into the factor matrix (whose sparsity pattern was set up in the
  symbolic phase), runs cusparseXcsrilu02() in place, then performs the numeric SpSV
  analysis for the L and U triangular solves and installs the ILU0 solve routines.
  The info argument is not used in this routine.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *tf    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a     = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *csrA;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Overwrite the factor's value array with A's values (device-to-device, stream-ordered) */
  m  = fact->rmap->n;
  nz = a->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  csrA = (CsrMatrix *)acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(tf->csrVal, csrA->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize in place; cusparseXcsrilu02() errors out on empty matrices (m = 0), so guard it */
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02(tf->handle, m, nz, tf->matDescr_M, tf->csrVal, tf->csrRowPtr, tf->csrColIdx, tf->ilu0Info_M, tf->policy_M, tf->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    cusparseStatus_t status;
    int              numerical_zero;

    status = cusparseXcsrilu02_zeroPivot(tf->handle, tf->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore
     it must run after cusparseXcsrilu02(); see https://github.com/NVIDIA/CUDALibrarySamples/issues/78 */
  PetscCallCUSPARSE(cusparseSpSV_analysis(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_L, tf->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_U, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_U, tf->spsvBuffer_U));

  /* L and U values changed: any transpose-solve analysis done earlier is now stale */
  tf->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(tf->numericFactFlops));
  PetscFunctionReturn(0);
}
1554 
1555 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1556   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1557   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1558   PetscInt                      m, nz;
1559 
1560   PetscFunctionBegin;
1561   if (PetscDefined(USE_DEBUG)) {
1562     PetscInt  i;
1563     PetscBool flg, missing;
1564 
1565     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1566     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1567     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1568     PetscCall(MatMissingDiagonal(A, &missing, &i));
1569     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1570   }
1571 
1572   /* Free the old stale stuff */
1573   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1574 
1575   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1576      but they will not be used. Allocate them just for easy debugging.
1577    */
1578   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1579 
1580   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1581   fact->factortype             = MAT_FACTOR_ILU;
1582   fact->info.factor_mallocs    = 0;
1583   fact->info.fill_ratio_given  = info->fill;
1584   fact->info.fill_ratio_needed = 1.0;
1585 
1586   aij->row = NULL;
1587   aij->col = NULL;
1588 
1589   /* ====================================================================== */
1590   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1591   /* We'll do in-place factorization on fact                                */
1592   /* ====================================================================== */
1593   const int *Ai, *Aj;
1594 
1595   m  = fact->rmap->n;
1596   nz = aij->nz;
1597 
1598   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1599   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1600   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1601   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1602   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1603   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1604 
1605   /* ====================================================================== */
1606   /* Create descriptors for M, L, U                                         */
1607   /* ====================================================================== */
1608   cusparseFillMode_t fillMode;
1609   cusparseDiagType_t diagType;
1610 
1611   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1612   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1613   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1614 
1615   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1616     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1617     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1618     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1619     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1620   */
1621   fillMode = CUSPARSE_FILL_MODE_LOWER;
1622   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1623   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1624   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1625   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1626 
1627   fillMode = CUSPARSE_FILL_MODE_UPPER;
1628   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1629   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1630   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1631   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1632 
1633   /* ========================================================================= */
1634   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1635   /* ========================================================================= */
1636   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1637   if (m)
1638     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1639                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1640 
1641   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1642   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1643 
1644   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1645   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1646 
1647   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1648   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1649 
1650   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1651   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1652 
1653   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1654      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1655      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1656      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1657    */
1658   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1659     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1660     fs->spsvBuffer_L = fs->factBuffer_M;
1661     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1662   } else {
1663     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1664     fs->spsvBuffer_U = fs->factBuffer_M;
1665     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1666   }
1667 
1668   /* ========================================================================== */
1669   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1670   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1671   /* ========================================================================== */
1672   int              structural_zero;
1673   cusparseStatus_t status;
1674 
1675   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1676   if (m)
1677     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1678                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1679   if (PetscDefined(USE_DEBUG)) {
1680     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1681     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1682     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1683   }
1684 
1685   /* Estimate FLOPs of the numeric factorization */
1686   {
1687     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1688     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1689     PetscLogDouble flops = 0.0;
1690 
1691     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1692     Ai    = Aseq->i;
1693     Adiag = Aseq->diag;
1694     for (PetscInt i = 0; i < m; i++) {
1695       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1696         nzRow  = Ai[i + 1] - Ai[i];
1697         nzLeft = Adiag[i] - Ai[i];
1698         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1699           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1700         */
1701         nzLeft = (nzRow - 1) / 2;
1702         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1703       }
1704     }
1705     fs->numericFactFlops = flops;
1706   }
1707   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1708   PetscFunctionReturn(0);
1709 }
1710 
/* Solve x = (L*Lt)^{-1} b entirely on device using the ICC(0) factor L held in fact->spptr.
   Requires that MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() has already run, so that the
   cusparseSpSV analysis for both the L and Lt solves is complete. The same routine serves
   MatSolve and MatSolveTranspose (see where it is installed in the numeric phase). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b: bind b's device array to descriptor X and the internal work vector fs->Y to Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y: rebind descriptor X to x's device array; the intermediate y is still in fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Flop estimate for the two triangular solves with nz nonzeros in the factor */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1740 
/* Numeric ICC(0) factorization on device: copy A's (device) CSR values into the factor,
   run cusparseXcsric02() in place on the factor's value array, then perform the SpSV
   analysis needed by the L and Lt triangular solves in MatSolve_SeqAIJCUSPARSE_ICC0().
   The symbolic phase (MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0) has already allocated
   the CSR arrays, descriptors, info structures and work buffers used here. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact (device-to-device, asynchronously on PETSc's default stream) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's device copy is current */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* In debug mode, check for a numerical zero pivot; cusparseXcsric02_zeroPivot() blocks until prior kernels finish */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analysis for the forward solve L y = b */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* The factored values now live only on device; install the device solve routines */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* FLOPs were estimated in the symbolic phase */
  PetscFunctionReturn(0);
}
1792 
/* Symbolic phase of the device ICC(0) factorization. ICC(0) has no fill, so the factor
   reuses A's sparsity pattern verbatim: we copy A's row pointers and column indices to
   the factor, create the cuSPARSE matrix/vector descriptors, size and allocate the
   csric02 and SpSV work buffers, run the csric02 analysis, and estimate the FLOPs of
   the upcoming numeric factorization. Only natural ordering (identity perm) reaches
   this routine (see MatICCFactorSymbolic_SeqAIJCUSPARSE). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocate fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) introduces no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* unlike ILU0, the Cholesky factor L has a non-unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* Work vectors reused by every subsequent triangular solve */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization, so the numeric phase can log them */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* heuristic: assume half of the off-diagonal entries lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
1933 #endif
1934 
/* Symbolic ILU dispatcher: when cuSPARSE is new enough, factorization is bound to the
   device, there is no fill (levels == 0) and both orderings are identity, use the
   all-device ILU(0) path; otherwise fall back to the host symbolic routine and hook
   the CUSPARSE numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool rowIdentity = PETSC_FALSE, colIdentity = PETSC_FALSE;

  if (factors->factorizeOnDevice) { /* only worth checking the orderings when the device path is allowed */
    PetscCall(ISIdentity(isrow, &rowIdentity));
    PetscCall(ISIdentity(iscol, &colIdentity));
  }
  if (!info->levels && rowIdentity && colIdentity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Host fallback: discard stale device factor data, then reuse the SeqAIJ symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1956 
/* Symbolic LU: always done on the host via the SeqAIJ routine; the numeric phase is
   redirected to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop any stale device-side triangular factors before rebuilding the symbolic data */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1966 
/* Symbolic ICC dispatcher: use the all-device ICC(0) path when cuSPARSE supports it,
   factorization is bound to the device, there is no fill (levels == 0) and the
   permutation is identity; otherwise fall back to the host SeqAIJ symbolic routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool identityPerm = PETSC_FALSE;

  if (factors->factorizeOnDevice) PetscCall(ISIdentity(perm, &identityPerm));
  if (!info->levels && identityPerm) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* Host fallback: clear stale device factors, then hook the CUSPARSE numeric phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1985 
/* Symbolic Cholesky: always done on the host via the SeqAIJ routine; the numeric
   phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop any stale device-side triangular factors before rebuilding the symbolic data */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1995 
/* Report the MatSolverType implemented by this factor: MATSOLVERCUSPARSE ("cusparse").
   Composed on factor matrices as the "MatFactorGetSolverType_C" method. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2001 
2002 /*MC
2003   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2004   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2007   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2008   algorithms are not recommended. This class does NOT support direct solver operations.
2009 
2010   Level: beginner
2011 
2012 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2013 M*/
2014 
/* Create an (empty) factor matrix B of type MATSEQAIJCUSPARSE for A and the requested
   factor type, wire up the symbolic-factorization function pointers (device versions
   unless A is bound to the CPU), record whether the user wants the factorization done
   on host or device (-mat_factor_bind_factorization), and set preferred orderings. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; /* must be set before MatSetType so the type constructor sees it */
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Parse -mat_factor_bind_factorization under the factor's (or A's) options prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; /* spptr was set up by MatSetType above */

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* CPU-bound matrices use the plain SeqAIJ factorizations */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); /* the factor allocates its own storage later */
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2066 
/* Copy the matrix values from device to host when the device copy is the newer one
   (offloadmask == PETSC_OFFLOAD_GPU). Handles both unfactored matrices (values live in
   the CsrMatrix inside spptr) and, with new-enough cuSPARSE, device-factored matrices
   (values live in the trifactors' csrVal). On success the host and device copies agree
   and offloadmask becomes PETSC_OFFLOAD_BOTH. Only the values are copied; the sparsity
   pattern (i, j) on the host is assumed current. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* For factored matrices spptr holds a Mat_SeqAIJCUSPARSETriFactors instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2095 
/* Give read/write access to the host value array, first syncing device -> host
   in case the GPU copy is the newer one. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* make the host copy current */
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2102 
/* End read/write host access: the host values may have been modified, so mark the
   device copy stale (PETSC_OFFLOAD_CPU) and invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* next device use must re-upload the values */
  PetscFunctionReturn(0);
}
2109 
/* Give read-only access to the host value array, first syncing device -> host
   in case the GPU copy is the newer one. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* make the host copy current */
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2116 
/* End read-only host access: nothing was modified, so the offload mask is left
   unchanged; only invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
2122 
/* Give write-only access to the host value array. Since the caller promises to
   overwrite the values, no device -> host copy is needed. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2128 
/* End write-only host access: host values were rewritten, so mark the device copy
   stale (PETSC_OFFLOAD_CPU) and invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* next device use must re-upload the values */
  PetscFunctionReturn(0);
}
2135 
/* Return device pointers to the CSR arrays (row offsets i, column indices j, values a)
   of an unfactored matrix, uploading the data to the GPU first if needed. Any of
   i/j/a/mtype may be NULL if the caller does not want that output. Fails with
   64-bit PetscInt since cuSPARSE indices here are 32-bit ints. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make the device copy current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA; /* everything returned lives in CUDA device memory */
  PetscFunctionReturn(0);
}
2165 
/* Ensure the device (cuSPARSE) copy of A is up to date with the host copy.

   No-op unless the offload mask says the host holds the valid data
   (PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU). Two paths:
   - nonzero pattern unchanged (same nonzerostate) and plain CSR format:
     copy the numerical values only and invalidate the cached transpose values;
   - otherwise: destroy and rebuild every device-side structure (CSR arrays,
     cuSPARSE descriptors, device scalar constants, compressed-row index list).

   Errors if the matrix has been bound to the CPU. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* will host and device be in sync afterwards? */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed but structure did not: invalidate transpose values only (PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* nonzero pattern (or storage format) changed: discard all device-side structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage keeps only rows with nonzeros; ridx maps them back to global rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        /* no host values yet (preallocation-only matrix): copy structure only */
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1), required by the
           CUSPARSE_POINTER_MODE_DEVICE setting below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary CSR matrix, convert it to HYB/ELL, then free the CSR copy */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted to HYB */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* if the host had no values to copy (both == PETSC_FALSE), keep the mask as-is */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2312 
/* Thrust functor for zipped iterators: accumulate tuple element 0 into element 1 (y += x) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2319 
/* Thrust functor for zipped iterators: copy tuple element 0 into element 1 (y = x) */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    using thrust::get;
    get<1>(t) = get<0>(t);
  }
};
2326 
/* Thrust functor for zipped iterators: copy tuple element 1 into element 0 (x = y),
   i.e. the assignment direction opposite to VecCUDAEquals */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    using thrust::get;
    get<0>(t) = get<1>(t);
  }
};
2333 
/* Per-product workspace attached to C->product->data for matrix products
   (sparse-dense and sparse-sparse) computed with cuSPARSE; freed by
   MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool      cisdense; /* was C of host type MATSEQDENSE? if so, the numeric phase converts C back after computing on the GPU */
  PetscScalar   *Bt;       /* buffer for an explicit B^T (pre-CUDA-11 csrmm cannot transpose the dense operand) */
  Mat            X;        /* intermediate dense product, needed for MATPRODUCT_PtAP and MATPRODUCT_RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count logged by the numeric phase */
  CsrMatrix     *Bcsr;     /* full CSR copy of B used when B is stored in compressed-row form */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B (SpMM path) */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C or X (SpMM path) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* extra work buffers for the CUDA >= 11.4 SpGEMMreuse path; freed in MatDestroy_MatMatCusparse */
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize; /* size of mmBuffer, to decide whether reallocation is needed */
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2358 
/* Destructor installed as C->product->destroy for MatMatCusparse data.
   Frees all device buffers, cuSPARSE descriptors, and the intermediate
   dense matrix X. cudaFree(NULL) and delete of NULL are no-ops, so members
   that were never allocated are handled safely; descriptors are destroyed
   only when non-NULL. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data) {
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2381 
2382 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2383 
/* Numeric phase of C = op(A) op(B) where A is MATSEQAIJCUSPARSE and B is dense.
   Handles MATPRODUCT_AB, AtB, ABt, PtAP, and RARt; for PtAP/RARt the sparse-dense
   product is computed into the intermediate mmdata->X and then multiplied with B
   (or B^T) via the dense-dense kernel. Uses cusparseSpMM on CUDA >= 11, otherwise
   the legacy csrmm path (which requires an explicit B^T for ABt/RARt). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* pick op(A) and the dimensions m x n of the sparse-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either use the implicit transpose (opA) or a cached explicit transpose of A */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* only grow the workspace, never shrink it */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly into mmdata->Bt with a cuBLAS out-of-place transpose */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  /* finish PtAP/RARt with the dense-dense multiply: C = B^T X (PtAP) or C = B X (RARt, transpose handled inside) */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* restore host types if the caller passed host matrices */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2539 
/* Symbolic phase of C = op(A) op(B) for A MATSEQAIJCUSPARSE, B dense.
   Sets the sizes and (CUDA dense) type of C, and allocates the MatMatCusparse
   product data: the B^T buffer for the pre-CUDA-11 csrmm path, and the
   intermediate dense matrix X for PtAP/RARt. Installs the numeric callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2608 
/* Numeric phase of the sparse-sparse product C = op(A) op(B), both MATSEQAIJCUSPARSE.
   Transposed operands are realized through cached explicit transposes (built in the
   symbolic phase), since cuSPARSE spgemm does not support transpose operations.
   Skips the computation entirely when the symbolic phase already produced the values
   (mmdata->reusesym) or when C has no nonzeros, then performs a trimmed-down
   MatAssemblyEnd_SeqAIJ at the "finalize" label. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty result: nothing to compute, just run the assembly bookkeeping */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetric operands let AtB/ABt fall back to the plain AB kernel; the symbolic
     phase must have made the same decision for the sparsity pattern to match */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized via the cached explicit-transpose mult structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: structure was fixed at symbolic time, recompute values only */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: compute into the spgemm descriptor, then copy into C's CSR */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* pre-CUDA-11 legacy csrgemm path */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2727 
/* Symbolic phase of C = A*B, A^T*B, or A*B^T when both operands are MATSEQAIJCUSPARSE.
   Computes the sparsity pattern of C on the GPU (cusparseSpGEMMreuse for CUDA >= 11.4,
   cusparseSpGEMM for CUDA 11.x, cusparseXcsrgemm before CUDA 11), then copies the row
   offsets and column indices back to the host so C is a valid SeqAIJ matrix for the rest
   of PETSc. Numeric values are filled later by MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE,
   which is installed as C->ops->productnumeric at the end. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  ptype = product->type;
  /* since cuSPARSE spgemm cannot do transposed operands, a symmetric operand lets us
     fall back to the plain AB product (the flag records that we exploited symmetry) */
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* select operand mult structs (explicit transposes are formed when needed, since opA/opB
     stay non-transpose) and decide which operands are in compressed-row storage */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used later with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: C has no nonzeros; skip the spgemm entirely and finalize an all-zero pattern */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with uncompressed row offsets; column indices and
       values are shared with (not copied from) B's compressed representation */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each a(i,brow) multiplies a full row of B: 2 flops per B entry touched */
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C starts with nnz = 0 and NULL arrays; cuSPARSE fills in the size, we allocate after */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
    void  *dBuffer1    = NULL;
    void  *dBuffer2    = NULL;
    void  *dBuffer3    = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz                = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
/* Mirror the GPU-computed pattern to the host so C is a complete SeqAIJ matrix */
finalizesym:
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not listed in rindex are empty: they repeat the previous compressed offset */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and row statistics from the freshly built row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt) !!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3099 
3100 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3101 
/* Selects the symbolic product implementation for a SEQAIJCUSPARSE A, with B either
   sparse (SEQAIJCUSPARSE) or dense (SEQDENSE). A per-product-type command line option
   lets the user force the CPU (SeqAIJ) backend instead. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* a CPU-bound operand disqualifies the GPU backend outright */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *apiopt = NULL, *apiname = NULL, *prodtitle = NULL;

    /* per product type: the legacy-API option name, the manual-page name (also the
       legacy-API help title), and the MatProduct-API help title. MATPRODUCT_ABt has
       no CPU-backend switch, so all three stay NULL and the query is skipped. */
    switch (product->type) {
    case MATPRODUCT_AB:
      apiopt    = "-matmatmult_backend_cpu";
      apiname   = "MatMatMult";
      prodtitle = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiopt    = "-mattransposematmult_backend_cpu";
      apiname   = "MatTransposeMatMult";
      prodtitle = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiopt    = "-matptap_backend_cpu";
      apiname   = "MatPtAP";
      prodtitle = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiopt    = "-matrart_backend_cpu";
      apiname   = "MatRARt";
      prodtitle = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiopt    = "-matmatmatmult_backend_cpu";
      apiname   = "MatMatMatMult";
      prodtitle = "MatProduct_ABC";
      break;
    default: break;
    }
    if (apiopt) {
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, apiname, "Mat");
        PetscCall(PetscOptionsBool(apiopt, "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, prodtitle, "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", apiname, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      /* dense B: GPU kernel unless A is pinned to the CPU */
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
    default: break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
3209 
/* yy = A * xx: forwards to the shared kernel with no add vector and no (Hermitian) transpose */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(0);
}
3215 
/* zz = A * xx + yy: forwards to the shared kernel with no (Hermitian) transpose */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(0);
}
3221 
/* yy = A^H * xx: forwards to the shared kernel with transpose + Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(0);
}
3227 
/* zz = A^H * xx + yy: forwards to the shared kernel with transpose + Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(0);
}
3233 
/* yy = A^T * xx: forwards to the shared kernel with transpose set, Hermitian unset */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(0);
}
3239 
/* y[idx[t]] += x[t] for t in [0,n): one thread per entry, 1D launch.
   NOTE(review): no atomics — assumes idx holds distinct entries per launch; verify at call sites */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  const int t = blockDim.x * blockIdx.x + threadIdx.x;

  if (t >= n) return; /* tail guard: the grid may overshoot n */
  y[idx[t]] += x[t];
}
3244 
3245 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3246 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
3247   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3248   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3249   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3250   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3251   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3252   PetscBool                     compressed;
3253 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3254   PetscInt nx, ny;
3255 #endif
3256 
3257   PetscFunctionBegin;
3258   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3259   if (!a->nz) {
3260     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
3261     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3262     PetscFunctionReturn(0);
3263   }
3264   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3265   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3266   if (!trans) {
3267     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3268     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3269   } else {
3270     if (herm || !A->form_explicit_transpose) {
3271       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3272       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3273     } else {
3274       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3275       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3276     }
3277   }
3278   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3279   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3280 
3281   try {
3282     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3283     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3284     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3285 
3286     PetscCall(PetscLogGpuTimeBegin());
3287     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3288       /* z = A x + beta y.
3289          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3290          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3291       */
3292       xptr = xarray;
3293       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3294       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3295 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3296       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3297           allocated to accommodate different uses. So we get the length info directly from mat.
3298        */
3299       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3300         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3301         nx             = mat->num_cols;
3302         ny             = mat->num_rows;
3303       }
3304 #endif
3305     } else {
3306       /* z = A^T x + beta y
3307          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3308          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3309        */
3310       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3311       dptr = zarray;
3312       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3313       if (compressed) { /* Scatter x to work vector */
3314         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3315         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3316                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3317       }
3318 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3319       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3320         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3321         nx             = mat->num_rows;
3322         ny             = mat->num_cols;
3323       }
3324 #endif
3325     }
3326 
3327     /* csr_spmv does y = alpha op(A) x + beta y */
3328     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3329 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3330       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3331       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3332         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3333         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3334         PetscCallCUSPARSE(
3335           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3336         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3337 
3338         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3339       } else {
3340         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3341         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3342         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3343       }
3344 
3345       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3346                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3347 #else
3348       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3349       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3350 #endif
3351     } else {
3352       if (cusparsestruct->nrows) {
3353 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3354         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3355 #else
3356         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3357         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3358 #endif
3359       }
3360     }
3361     PetscCall(PetscLogGpuTimeEnd());
3362 
3363     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3364       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3365         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3366           PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
3367         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3368           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
3369         }
3370       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3371         PetscCall(VecSet_SeqCUDA(zz, 0));
3372       }
3373 
3374       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3375       if (compressed) {
3376         PetscCall(PetscLogGpuTimeBegin());
3377         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3378            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3379            prevent that. So I just add a ScatterAdd kernel.
3380          */
3381 #if 0
3382         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3383         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3384                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3385                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3386                          VecCUDAPlusEquals());
3387 #else
3388         PetscInt n = matstruct->cprowIndices->size();
3389         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3390 #endif
3391         PetscCall(PetscLogGpuTimeEnd());
3392       }
3393     } else {
3394       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3395     }
3396     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3397     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3398     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3399   } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
3400   if (yy) {
3401     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3402   } else {
3403     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3404   }
3405   PetscFunctionReturn(0);
3406 }
3407 
/* zz = A^T*xx + yy: thin wrapper that delegates to the shared mult-add kernel
   with trans = PETSC_TRUE (transpose) and herm = PETSC_FALSE (not Hermitian) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3413 
/* Finish assembly on the host, then discard the packed device matrix if the
   nonzero pattern changed (it would no longer describe the new structure). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  PetscObjectState    state_before = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusparse     = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* Host-side assembly runs first; it may bump A->nonzerostate */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (cusparse->deviceMat && state_before != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparse->deviceMat));
    cusparse->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3427 
3428 /* --------------------------------------------------------------------------------*/
3429 /*@
3430    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3432    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3433    assembly performance the user should preallocate the matrix storage by setting
3434    the parameter nz (or the array nnz).  By setting these parameters accurately,
3435    performance during matrix assembly can be increased by more than a factor of 50.
3436 
3437    Collective
3438 
3439    Input Parameters:
3440 +  comm - MPI communicator, set to PETSC_COMM_SELF
3441 .  m - number of rows
3442 .  n - number of columns
3443 .  nz - number of nonzeros per row (same for all rows)
3444 -  nnz - array containing the number of nonzeros in the various rows
3445          (possibly different for each row) or NULL
3446 
3447    Output Parameter:
3448 .  A - the matrix
3449 
3450    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3452    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3453 
3454    Notes:
3455    If nnz is given then nz is ignored
3456 
3457    The AIJ format (also called the Yale sparse matrix format or
3458    compressed row storage), is fully compatible with standard Fortran 77
3459    storage.  That is, the stored row and column indices can begin at
3460    either one (as in Fortran) or zero.  See the users' manual for details.
3461 
3462    Specify the preallocated storage with either nz or nnz (not both).
3463    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3464    allocation.  For large problems you MUST preallocate memory or you
3465    will get TERRIBLE performance, see the users' manual chapter on matrices.
3466 
3467    By default, this format uses inodes (identical nodes) when possible, to
3468    improve numerical efficiency of matrix-vector products and solves. We
3469    search for consecutive rows with the same nonzero structure, thereby
3470    reusing matrix information to achieve increased efficiency.
3471 
3472    Level: intermediate
3473 
3474 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3475 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* const is cast away to match the preallocation routine's signature; nnz is not modified */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}
3484 
/* Tear down the GPU-side data and detach every method this type composed on the
   object, then fall through to the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  /* All method names composed on the object by this matrix type */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                  "MatCUSPARSESetFormat_C",
                                  "MatCUSPARSESetUseCPUSolve_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                  "MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C",
                                  "MatSetValuesCOO_C",
                                  "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* Factored and unfactored matrices keep different structures in A->spptr */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3505 
3506 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3507 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate on the host with the base SeqAIJ routine, then convert the copy in
   place so the duplicate is again of type MATSEQAIJCUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
3514 
/* Y = a*X + Y on the GPU. Three paths:
     SAME_NONZERO_PATTERN    -> values arrays line up; dense cuBLAS axpy on nz entries
     SUBSET_NONZERO_PATTERN  -> in-place cusparse csrgeam reusing Y's structure
     otherwise               -> fall back to the host implementation */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* Mixed bindings (one matrix on CPU): use the host path, after invalidating
       Y's cached transpose which the host-side update would leave stale */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* Same nonzero count: compare row offsets and column indices on the device;
       if both agree the patterns are actually identical */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via csrgeam, writing the result back into Y's own arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b are host pointers; switch pointer mode for the duration of the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11: csrgeam needs an explicit caller-allocated work buffer */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore PETSc's default device pointer mode on the shared handle */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* Identical structure: the nz value arrays correspond entry-by-entry, so a
       flat cuBLAS axpy over the values suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* No structural relationship we can exploit: do it on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
3599 
/* Y = a*Y: scale all stored nonzero values in place with a flat cuBLAS scal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals)); /* device pointer to the nz values */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal no longer matches scaled values */
  PetscFunctionReturn(0);
}
3618 
/* Zero all stored values of A on both the device (matrix and cached transpose,
   when present) and the host, then set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  PetscBool   both = PETSC_FALSE; /* PETSC_TRUE once the device copy has been zeroed too */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      /* NOTE(review): this cast assumes CSR storage; for ELL/HYB the payload is a
         cusparseHybMat_t, not a CsrMatrix -- confirm those formats cannot reach here */
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached explicit transpose consistent by zeroing it as well */
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  /* zero the host-side values array (a->i[nrows] entries) */
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* both copies zeroed -> BOTH; otherwise only the host copy is valid */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3644 
/* Switch the matrix's method tables between the host (flg = PETSC_TRUE) and the
   GPU (flg = PETSC_FALSE) implementations, and compose/uncompose the matching
   query functions. Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* binding to CPU: make sure the host copy is up to date first */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ-internal ops (array accessors etc.) set in the GPU branch below */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* SeqAIJ-internal accessors route value access through the GPU-aware paths */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only enable them when bound to CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3707 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE. Creates the cuSPARSE handle and
   the per-matrix GPU bookkeeping structure (spptr), installs the GPU method
   tables via MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE), and retypes B. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  /* vectors created from this matrix default to the CUDA vector type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: full cuSPARSE structure with algorithm selections */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: only the triangular-factor structure is needed */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU method tables (bind-to-CPU with flg = PETSC_FALSE) */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3766 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place to MATSEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
3773 
3774 /*MC
3775    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3776 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3778    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3779    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3780 
3781    Options Database Keys:
3782 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3783 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3786 
3787   Level: beginner
3788 
3789 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3790 M*/
3791 
3792 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3793 
/* Register the cuSPARSE solver packages: the banded LU factorization for plain
   SEQAIJ, and the regular cuSPARSE factorizations for SEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  for (size_t i = 0; i < sizeof(factors) / sizeof(factors[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(0);
}
3804 
/* Release all COO-preallocation state (host-side thrust permutations and, in
   extended-COO mode, the device maps jmap_d/perm_d).

   Fix: jmap_d and perm_d are now set to NULL after cudaFree(). They were left
   dangling before, and MatSeqAIJCUSPARSE_Destroy() frees any non-NULL
   jmap_d/perm_d again, producing a double cudaFree when the COO preallocation
   was reset prior to matrix destruction. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* prevent a second free in MatSeqAIJCUSPARSE_Destroy() */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3821 
/* Free the full GPU bookkeeping structure of an unfactored matrix: both mult
   structures, thrust work/index vectors, device COO maps, and the handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) {
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* thrust device vectors: delete handles nullptr safely */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->csr2csc_i;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
    if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));
    PetscCall(PetscFree(*cusparsestruct)); /* frees the struct and zeroes *cusparsestruct */
  }
  PetscFunctionReturn(0);
}
3839 
/* Free a CsrMatrix (its three thrust device arrays plus the struct itself)
   and zero the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) {
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3851 
/* Overload for a triangular-factor structure: release its descriptor, solve
   info, CSR storage, and the solve/csr2csc work buffers, then the struct. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) {
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
    if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
    PetscCall(CsrMatrix_Destroy(&tf->csrMat));
    if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
    if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h)); /* pinned host copy */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor)); /* frees the struct and zeroes *trifactor */
  }
  PetscFunctionReturn(0);
}
3867 
/* Destroy a mult structure: the stored matrix (CSR or, pre CUDA-11, HYB), the
   legacy descriptor, compressed-row indices, device-resident scalar constants,
   and (CUDA >= 11) the generic SpMV descriptors/buffers.

   Fix: the CsrMatrix_Destroy() call is now wrapped in PetscCall(); its returned
   error code was silently dropped before, unlike every other call here. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) {
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was missing PetscCall(): error code was ignored */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by SpMV */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached descriptor/buffer set per cusparseOperation_t (N, T, H) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3908 
/* Release all factorization data held by *trifactors while keeping the container
   (and its cusparse handle) alive, so it can be reused for a new factorization. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) {
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* lower/upper triangular factors and their transposes (legacy csrsv-based path) */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    /* thrust containers: delete on NULL is a no-op */
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    /* a_band_d/i_band_d device buffers (band-factorization members, per their names) */
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* SpSV-based path. NOTE(review): these calls are unconditional, so the members are
       presumably NULL-initialized; cudaFree(NULL) is a documented no-op, but confirm the
       cusparse destroy routines tolerate NULL descriptors as well. */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
3956 
/* Fully destroy the tri-factors container: reset all factorization data, destroy
   the cusparse handle, then free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle; /* fix: assignment was previously buried inside the if-condition (-Wparentheses hazard) */
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3968 
/* Strict-weak ordering of (row, col) pairs: sort by row, break ties by column.
   Used as the comparator for thrust::sort_by_key on zipped COO indices. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b) {
    const PetscInt ra = a.get<0>(), rb = b.get<0>();
    return (ra < rb) || (ra == rb && a.get<1>() < b.get<1>());
  }
};
3976 
/* Equality of (row, col) pairs: two COO entries match iff both indices agree.
   Used by thrust::unique to collapse duplicate entries. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b) { return a.get<0>() == b.get<0>() && a.get<1>() == b.get<1>(); }
};
3983 
/* Binary op for thrust::adjacent_difference: 1 when adjacent values differ, else 0. */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &a, const PetscInt &b) { return (a != b) ? 1 : 0; }
};
3987 
/* Logical OR of two 0/1 flags, returned as a PetscInt (0 or 1). */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &a, const PetscInt &b) { return (a != 0 || b != 0) ? 1 : 0; }
};
3991 
3992 #include <thrust/iterator/discard_iterator.h>
3993 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert (INSERT_VALUES) or add (ADD_VALUES) the COO values v[] into the device CSR
   matrix, using the permutation cusp->cooPerm[] (and cusp->cooPerm_a[] when the COO
   input had repeated (i,j) entries) built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   v may live on host or device; v == NULL zeroes the matrix for INSERT_VALUES and is a
   no-op for ADD_VALUES.  Finishes by marking the data valid on the GPU and doing a
   shortened form of MatAssemblyEnd_SeqAIJ's bookkeeping. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* temporary device copy when v[] is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: just run the regular assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: stage it on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them before adding */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
4069 
/* Mark the cached device transpose as stale; when 'destroy' is true, also free the
   transpose mult struct and the cached csr2csc index array. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE; /* force a rebuild before the transpose is next used */
  }
  PetscFunctionReturn(0);
}
4084 
4085 #include <thrust/binary_search.h>
4086 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A directly on the device from COO arrays coo_i[]/coo_j[]
   (each may independently be a host or device pointer).  Also builds cusp->cooPerm,
   the sorting permutation later applied to v[] in MatSetValuesCOO, and, when the COO
   list contains duplicate (i,j) entries, cusp->cooPerm_a, mapping each sorted COO
   position to its unique-nonzero index.  Requires nonnegative indices (the 'Basic' path). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* COO size changed: the old permutation arrays are invalid */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* stage coo_i[] on the device if the caller passed a host pointer */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays (a->i, a->j, a->a) from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4220 
/* COO preallocation dispatcher.  If the index arrays live on the device, or live on
   the host and contain no negative indices, use the fast device-side 'Basic' path;
   otherwise fall back to the host SeqAIJ COO setup and mirror its jmap/perm arrays
   on the device ("extended" COO, consumed by MatSetValuesCOO_SeqAIJCUSPARSE). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* discard any previous COO setup on both host and device sides */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) { /* we can only scan for negative indices when the arrays are host-accessible */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* host path builds seq->jmap/seq->perm; copy them to the device for the COO kernel */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4258 
/* Kernel: for nonzero idx, accumulate the COO inputs kv[perm[jmap[idx]..jmap[idx+1])]
   into a[idx]; INSERT_VALUES overwrites, otherwise the sum is added to the old value.
   Grid-stride loop, so any 1D launch configuration covers all nnz entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nnz; idx += stride) {
    PetscScalar val = 0.0;
    for (PetscCount k = jmap[idx]; k < jmap[idx + 1]; k++) val += kv[perm[k]];
    a[idx] = (imode == INSERT_VALUES ? 0.0 : a[idx]) + val;
  }
}
4268 
/* Set or add matrix values from a COO array v[] (host or device pointer).
   Extended-COO path launches the MatAddCOOValues kernel with the device-mirrored
   jmap/perm arrays; otherwise dispatch to the thrust-based 'Basic' implementation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v; /* device-accessible alias of v[] */
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    /* NOTE(review): unlike MatSetValuesCOO_SeqAIJCUSPARSE_Basic(), this path does not
       guard against v == NULL before the copy below -- confirm callers never pass NULL here */
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites every entry in the kernel, so write-only access suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch errors without clearing the sticky error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the temporary device staging copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}
4302 
4303 /*@C
4304     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4305 
4306    Not collective
4307 
4308     Input Parameters:
4309 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4315 
4316     Level: developer
4317 
4318     Notes:
4319       When compressed is true, the CSR structure does not contain empty rows
4320 
4321 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4322 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* caller must request both; after this, i and j are non-NULL */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (!compressed && a->compressedrow.use) { /* device CSR is compressed but caller wants the full row offsets */
    if (!cusp->rowoffsets_gpu) {             /* lazily build and cache the uncompressed offsets on the device */
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else {
    *i = csr->row_offsets->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4349 
4350 /*@C
4351     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4352 
4353    Not collective
4354 
4355     Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4362 
4363     Level: developer
4364 
4365 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4366 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the arrays remain owned by the matrix; just clear the caller's handles */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4375 
4376 /*@C
4377    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4378 
4379    Not Collective
4380 
4381    Input Parameter:
4382 .   A - a MATSEQAIJCUSPARSE matrix
4383 
4384    Output Parameter:
4385 .   a - pointer to the device data
4386 
4387    Level: developer
4388 
4389    Notes: may trigger host-device copies if up-to-date matrix data is on host
4390 
4391 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4392 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL && spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* ensure the device copy is current before handing out a pointer to it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get(); /* read-only access: offload mask is left untouched */
  }
  PetscFunctionReturn(0);
}
4409 
4410 /*@C
4411    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4412 
4413    Not Collective
4414 
4415    Input Parameter:
4416 .   A - a MATSEQAIJCUSPARSE matrix
4417 
4418    Output Parameter:
4419 .   a - pointer to the device data
4420 
4421    Level: developer
4422 
4423 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4424 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: nothing to invalidate, just drop the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4433 
4434 /*@C
4435    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4436 
4437    Not Collective
4438 
4439    Input Parameter:
4440 .   A - a MATSEQAIJCUSPARSE matrix
4441 
4442    Output Parameter:
4443 .   a - pointer to the device data
4444 
4445    Level: developer
4446 
4447    Notes: may trigger host-device copies if up-to-date matrix data is on host
4448 
4449 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4450 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL && spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: bring the device copy up to date first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get();
  }
  /* the caller may write through *a, so the GPU copy becomes authoritative
     and any cached transpose must be rebuilt before use */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4469 /*@C
4470    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4471 
4472    Not Collective
4473 
4474    Input Parameter:
4475 .   A - a MATSEQAIJCUSPARSE matrix
4476 
4477    Output Parameter:
4478 .   a - pointer to the device data
4479 
4480    Level: developer
4481 
4482 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4483 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4494 
4495 /*@C
4496    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4497 
4498    Not Collective
4499 
4500    Input Parameter:
4501 .   A - a MATSEQAIJCUSPARSE matrix
4502 
4503    Output Parameter:
4504 .   a - pointer to the device data
4505 
4506    Level: developer
4507 
4508    Notes: does not trigger host-device copies and flags data validity on the GPU
4509 
4510 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4511 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(spptr->format != MAT_CUSPARSE_ELL && spptr->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, the old values are not needed */
  PetscCheck(spptr->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get();
  }
  /* the GPU copy becomes authoritative; any cached transpose must be rebuilt before use */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4529 
4530 /*@C
4531    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4532 
4533    Not Collective
4534 
4535    Input Parameter:
4536 .   A - a MATSEQAIJCUSPARSE matrix
4537 
4538    Output Parameter:
4539 .   a - pointer to the device data
4540 
4541    Level: developer
4542 
4543 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4544 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4555 
/* Order (row, col, value, index) tuples lexicographically by (row, col) only;
   the value and index components ride along unexamined. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &a, const thrust::tuple<int, int, PetscScalar, int> &b) {
    const int ra = a.get<0>(), rb = b.get<0>();
    return (ra < rb) || (ra == rb && a.get<1>() < b.get<1>());
  }
};
4563 
/* Unary functor adding a fixed offset to an integer; used to relocate column indices
   (and transpose row offsets) of the second matrix when two matrices are concatenated. */
struct Shift {
  int delta; /* offset added to every input */

  Shift(int shift) : delta(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return delta + c; }
};
4570 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows, i.e. C = [A,B]
   ([A';B']' operation in matlab notation). The pattern merge (COO conversion, sorted
   zip-iterator merge, CSR reassembly) runs on the GPU; only the resulting (i,j) pattern
   is mirrored back to the host. With MAT_REUSE_MATRIX the merge permutation cached in
   Ccusp->cooPerm is replayed to refresh the numerical values only. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* NOTE(review): "number or rows" in the message below looks like a typo for "number of rows" */
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* C has the rows of A (== rows of B) and the columns of A followed by the columns of B */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    /* cusparse descriptor and device-resident scalar constants (1 and 0) used by the mult kernels */
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    /* make sure both operands have up-to-date CSR data on the device */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm caches, for each entry of A then of B, its position in C (replayed by MAT_REUSE_MATRIX) */
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets into explicit COO row indices so the triples can be merged */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* view B's column indices shifted by A->cmap->n, since B's columns follow A's in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the (row, col, val, source-flag) tuples of A (flag 1) and B (flag 0) ordered
         by (row, col); wPerm receives the per-entry source flag in merged order */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place column shift applied to B above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* from the source flags, recover each A entry's destination position in C (first Annz
         slots of cooPerm, via copy_if on flag!=0) and each B entry's (remaining Bnnz slots,
         via remove_copy_if) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back to CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T] stacked by rows, so its CSR arrays are plain concatenations of
           A^T's and B^T's, with B^T's row offsets shifted by a->nz (= nnz(A)) */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one so B^T's first shifted offset overwrites A^T's closing offset */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's (i,j) CSR pattern to the host so the Mat_SeqAIJ side is consistent */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    /* fill the per-row bookkeeping: row lengths, count of nonempty rows, longest row */
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    /* host-side value array is allocated but not filled here; the values live on the GPU */
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the pattern of C is fixed; only scatter the current values of A and B
       into C's value array using the cached merge permutation */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through the first Annz slots of the cached permutation */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      /* ... and B's values through the remaining slots */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* C^T values are A^T's followed by B^T's (see the MAT_INITIAL_MATRIX branch) */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4851 
/* Gathers v[k] = values(A)[idx[k]] for k = 0,...,n-1 from the matrix value array kept on
   the GPU. v may point to host or device memory (detected with isCudaMem()); idx is a
   host array. When idx is NULL (or n is 0) the first n values are copied verbatim.
   Fix: the transfer to a host destination was logged as PetscLogCpuToGpu even though the
   data moves device -> host; it is now logged with PetscLogGpuToCpu. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n); /* host -> device copy of the index set */
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather dv[k] = dav[widx[k]] on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination implies the n values moved device -> host above */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}
4886