xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 69f65dfb176f3d3e756fc44d2300fd6791726976)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 #include <thrust/adjacent_difference.h>
15 #if PETSC_CPP_VERSION >= 14
16   #define PETSC_HAVE_THRUST_ASYNC 1
17   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18   #include <thrust/async/for_each.h>
19 #endif
20 #include <thrust/iterator/constant_iterator.h>
21 #include <thrust/remove.h>
22 #include <thrust/sort.h>
23 #include <thrust/unique.h>
24 
/* Human-readable names for PetscOptionsEnum(); the trailing entries are, by PETSc convention,
   the enum type name, the option prefix, and a null terminator */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
/* Note the SpMM list is NOT in cusparse.h declaration order: entries are placed at the index equal
   to their cuSPARSE integer value (e.g. COO_ALG4 = 5 comes after CSR_ALG1 = 4) */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
61 
/* Forward declarations: symbolic/numeric factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

/* Triangular solves and matrix-vector products (plain/transpose/Hermitian, with and without add) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destructors for the GPU-side data structures hanging off Mat->spptr */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* COO assembly interface */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
97 
/* Type-specific implementation backing MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.
   A sequential matrix stores a single format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   both set that one field; any other operation is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both ops target the single stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
115 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: [](chapter_matrices), `Mat`, `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation composed on A; silently a no-op for non-cusparse matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
139 
/* Type-specific implementation backing MatCUSPARSESetUseCPUSolve(): records whether
   MatSolve() should run on the CPU instead of the GPU for this matrix */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
148 
/*@
   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Level: intermediate

   Note:
   The cuSPARSE LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation composed on A; silently a no-op for non-cusparse matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
172 
/* MatSetOption implementation for MATSEQAIJCUSPARSE: intercepts the one option with
   GPU-side state (the cached explicit transpose) and forwards everything else to the
   host SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
188 
189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
190 
/* Numeric LU factorization for MATSEQAIJCUSPARSE.
   The factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ); this wrapper then
   selects the appropriate GPU (or CPU) MatSolve variant and, unless a CPU solve was
   requested, stages the triangular factors on the device. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscBool           natural_row, natural_col;

  PetscFunctionBegin;
  /* the CPU factorization below needs up-to-date host values for A */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: natural ordering skips permutations */
  PetscCall(ISIdentity(b->row, &natural_row));
  PetscCall(ISIdentity(b->col, &natural_col));
  if (!cusp->use_cpu_solve) {
    const PetscBool natural = (PetscBool)(natural_row && natural_col);

    B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU for the device solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
222 
/* Processes -mat_cusparse_* command line options for a MATSEQAIJCUSPARSE matrix.
   Storage-format and algorithm options apply only to unfactored matrices. Note the
   option order matters: -mat_cusparse_storage_format is queried after
   -mat_cusparse_mult_storage_format, so when both are given the former wins. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same positional-consistency guard for the SpMM enum */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
258 
/* Builds (or refreshes) the GPU CSR copy of the unit-diagonal lower triangular factor L
   from the host SeqAIJ factor data in A->data, and runs the cuSPARSE triangular-solve
   analysis on it. On first call the full structure (row offsets, column indices, values,
   descriptor, solve info) is created; on later calls only the values are re-uploaded.
   The host value buffer AA_h is kept pinned for fast repeated uploads. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* rebuild only when the host copy is newer than (or the only copy of) the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host staging buffers (cudaMallocHost) for the device uploads below */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: thrust device arrays, filled from the pinned host buffers */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an externally-allocated work buffer; query its size first */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep the pinned value buffer (AA_h) for later value-only refreshes */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): thrust typically throws std::exception-derived types, not char*;
         this catch matches the historical pattern used throughout this file — verify coverage */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
389 
/* Builds (or refreshes) the GPU CSR copy of the upper triangular factor U from the host
   SeqAIJ factor data, and runs the cuSPARSE triangular-solve analysis on it. The host
   factor stores U's rows via a->diag in reverse layout, so rows are reconstructed back
   to front; the stored diagonal is inverted (1/v[nz]) because the host factor keeps
   reciprocal pivots while the device solve uses a NON_UNIT diagonal. Structure is built
   once; later calls re-upload only the values through the pinned AA_h buffer. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* rebuild only when the host copy is newer than (or the only copy of) the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers (cudaMallocHost) for the device uploads below */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          /* then the strictly-upper entries of row i */
          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: thrust device arrays, filled from the pinned host buffers */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an externally-allocated work buffer; query its size first */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep the pinned value buffer (AA_h) for later value-only refreshes */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): thrust typically throws std::exception-derived types, not char*;
         this catch matches the historical pattern used throughout this file — verify coverage */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
520 
/* Stages the ILU triangular factors of A on the GPU: builds/refreshes the lower and upper
   CSR factors, allocates the shared work vector, and caches the row/column permutation
   index sets on the device when the ordering is not the identity. Marks A as offloaded
   on both host and device when done. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  const PetscInt                n       = A->rmap->n;
  PetscBool                     rows_natural, cols_natural;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the row permutation (a->row) for the lower solve unless it is the identity */
  PetscCall(ISIdentity(aij->row, &rows_natural));
  if (!rows_natural && !factors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(aij->row, &ridx));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ridx, ridx + n);
    PetscCall(ISRestoreIndices(aij->row, &ridx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* likewise upload the column permutation (a->icol) for the upper solve */
  PetscCall(ISIdentity(aij->icol, &cols_natural));
  if (!cols_natural && !factors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(aij->icol, &cidx));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(cidx, cidx + n);
    PetscCall(ISRestoreIndices(aij->icol, &cidx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
563 
/*
  Builds (or refreshes) the cuSPARSE triangular-factor structures used by the ICC solves.

  On the first call (neither upTriFactor nor loTriFactor exists yet) the host-side factor
  stored in A->data is converted to a 0-based CSR upper-triangular matrix with inverted
  diagonal, uploaded to the GPU, and csrsv analysis is performed twice: once for the upper
  factor (non-transpose solve, unit diagonal) and once for the "lower" factor, which reuses
  the SAME upper-triangular sparsity pattern but is solved with CUSPARSE_OPERATION_TRANSPOSE
  and a non-unit diagonal. Subsequent calls only recompute the numerical values on the host
  and re-upload them into the existing device arrays.

  No-op when the matrix has no rows or the data is already current on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* host (pinned) CSR row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;        /* host (pinned) values of the upper factor */
  PetscScalar                       *AALo;        /* host (pinned) values of the lower factor (same pattern as upper) */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is viewed through both Mat_SeqAIJ and Mat_SeqSBAIJ here; this
     relies on the two structs sharing the layout of the i/j/a members — confirm */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers for fast async-capable upload of the factor values */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: the stored diagonal is inverted so the
             GPU solve can multiply rather than divide */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* copy the off-diagonal entries; upper values are negated, lower values are
               additionally scaled by the inverted diagonal of this row */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* intentionally UPPER: the lower solve is done as a TRANSPOSE solve on the
           upper-triangular pattern (see solveOp below), so no CSC copy is needed */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structures already exist: only the numerical values changed.
           Fill the upper triangular matrix values (same recurrence as the first-build path) */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        /* re-upload only the values; row offsets and column indices are unchanged */
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
760 
/*
  Uploads the ICC triangular factors of A to the GPU and prepares the auxiliary data
  needed by the GPU solve kernels: a device work vector of length n, the total factor
  nonzero count, and — when the factorization row permutation is not the identity —
  device copies of the permutation (rpermIndices) and its inverse (cpermIndices)
  used to permute the right-hand side/solution on the device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row; /* row permutation from the factorization */
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* factor stores only the upper triangle; full symmetric nnz = 2*offdiag + diag */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: only needed when the ordering actually permutes rows */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
797 
/*
  Numeric Cholesky (ICC) factorization driver for SeqAIJCUSPARSE matrices.

  The factorization itself runs on the host via the SeqAIJ implementation; afterwards
  the solve function pointers are selected (natural-ordering variants avoid the
  device-side permutation step when the row ordering is the identity) and the
  triangular factors are analyzed and copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *bseq    = (Mat_SeqAIJ *)B->data;
  IS          rowperm = bseq->row;
  PetscBool   natural;

  PetscFunctionBegin;
  /* factor on the host: pull A down, factor, and mark the result as CPU-resident */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  PetscCall(ISIdentity(rowperm, &natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
826 
/*
  Builds the transposes (CSC forms) of the lower and upper triangular factors and runs
  the csrsv solve analysis on them, so that MatSolveTranspose can be performed on the GPU.
  The transposed factors are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose
  of A->spptr. The fill mode of each transpose is the flip of the original factor's;
  index base, matrix type, and diagonal type are inherited unchanged.

  Fixes vs. previous revision: the MAT_CUSPARSEGenerateTranspose event was "begun" twice
  (Begin/Begin instead of Begin/End) around both csr2csc conversions, corrupting the
  profiling event nesting; both closing calls are now PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; transposing flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transpose is stored explicitly, so the solve itself is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously a second EventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; transposing flips the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously a second EventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
990 
/* Unary functor usable on host and device: truncates the real part of a PetscScalar
   to a PetscInt (intended for thrust transform-style algorithms over scalar arrays). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return static_cast<PetscInt>(PetscRealPart(s)); }
};
994 
995 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
996 {
997   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
998   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
999   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1000   cusparseStatus_t              stat;
1001   cusparseIndexBase_t           indexBase;
1002 
1003   PetscFunctionBegin;
1004   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1005   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1006   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1007   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1008   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1009   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1010   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1011   PetscCall(PetscLogGpuTimeBegin());
1012   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1013   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1014     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1015     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1016     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1017     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1018     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1019 
1020     /* set alpha and beta */
1021     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1022     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1023     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1024     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1025     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1026     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1027 
1028     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1029       CsrMatrix *matrixT      = new CsrMatrix;
1030       matstructT->mat         = matrixT;
1031       matrixT->num_rows       = A->cmap->n;
1032       matrixT->num_cols       = A->rmap->n;
1033       matrixT->num_entries    = a->nz;
1034       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1035       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1036       matrixT->values         = new THRUSTARRAY(a->nz);
1037 
1038       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1039       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1040 
1041 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1042   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1043       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1044                                indexBase, cusparse_scalartype);
1045       PetscCallCUSPARSE(stat);
1046   #else
1047       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1048            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1049 
1050            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1051            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1052            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1053         */
1054       if (matrixT->num_entries) {
1055         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1056         PetscCallCUSPARSE(stat);
1057 
1058       } else {
1059         matstructT->matDescr = NULL;
1060         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1061       }
1062   #endif
1063 #endif
1064     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1065 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1066       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1067 #else
1068       CsrMatrix *temp = new CsrMatrix;
1069       CsrMatrix *tempT = new CsrMatrix;
1070       /* First convert HYB to CSR */
1071       temp->num_rows = A->rmap->n;
1072       temp->num_cols = A->cmap->n;
1073       temp->num_entries = a->nz;
1074       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1075       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1076       temp->values = new THRUSTARRAY(a->nz);
1077 
1078       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1079       PetscCallCUSPARSE(stat);
1080 
1081       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1082       tempT->num_rows = A->rmap->n;
1083       tempT->num_cols = A->cmap->n;
1084       tempT->num_entries = a->nz;
1085       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1086       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1087       tempT->values = new THRUSTARRAY(a->nz);
1088 
1089       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1090                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1091       PetscCallCUSPARSE(stat);
1092 
1093       /* Last, convert CSC to HYB */
1094       cusparseHybMat_t hybMat;
1095       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1096       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1097       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1098       PetscCallCUSPARSE(stat);
1099 
1100       /* assign the pointer */
1101       matstructT->mat = hybMat;
1102       A->transupdated = PETSC_TRUE;
1103       /* delete temporaries */
1104       if (tempT) {
1105         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1106         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1107         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1108         delete (CsrMatrix *)tempT;
1109       }
1110       if (temp) {
1111         if (temp->values) delete (THRUSTARRAY *)temp->values;
1112         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1113         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1114         delete (CsrMatrix *)temp;
1115       }
1116 #endif
1117     }
1118   }
1119   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1120     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1121     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1122     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1123     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1124     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1125     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1126     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1127     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1128     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1129     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1130     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1131       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1132       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1133       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1134     }
1135     if (!cusparsestruct->csr2csc_i) {
1136       THRUSTARRAY csr2csc_a(matrix->num_entries);
1137       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1138 
1139       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1140 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1141       void  *csr2cscBuffer;
1142       size_t csr2cscBufferSize;
1143       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1144                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1145       PetscCallCUSPARSE(stat);
1146       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1147 #endif
1148 
1149       if (matrix->num_entries) {
1150         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1151            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1152            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1153 
1154            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1155            should be filled with indexBase. So I just take a shortcut here.
1156         */
1157         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1158 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1159                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1160         PetscCallCUSPARSE(stat);
1161 #else
1162                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1163         PetscCallCUSPARSE(stat);
1164 #endif
1165       } else {
1166         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1167       }
1168 
1169       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1170       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1172       PetscCallCUDA(cudaFree(csr2cscBuffer));
1173 #endif
1174     }
1175     PetscCallThrust(
1176       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1177   }
1178   PetscCall(PetscLogGpuTimeEnd());
1179   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1180   /* the compressed row indices is not used for matTranspose */
1181   matstructT->cprowIndices = NULL;
1182   /* assign the pointer */
1183   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1184   A->transupdated                                = PETSC_TRUE;
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
1188 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the cached transposed triangular factors, applying the
   row/column permutations recorded at factorization time. The transposed factors
   are built lazily on the first transpose solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barr;
  PetscScalar                          *xarr;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                          *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Build the transpose analysis on the fly if it is not cached yet */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  /* Map both vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarr));
  PetscCall(VecCUDAGetArrayRead(bb, &barr));
  xGPU = thrust::device_pointer_cast(xarr);
  bGPU = thrust::device_pointer_cast(barr);

  PetscCall(PetscLogGpuTimeBegin());
  /* Apply the row permutation to the right-hand side, staging it in x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, fs->rpermIndices->end()), xGPU);

  /* Solve with the transposed upper factor: x -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, xarr, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Solve with the transposed lower factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), xarr, loT->solvePolicy, loT->solveBuffer));

  /* Apply the column permutation; this cannot be done in place, so gather into
     the work vector first and then copy back into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, fs->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, fs->cpermIndices->end()), work->begin());
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), work->begin(), work->end(), xGPU);

  /* Restore the vectors */
  PetscCall(VecCUDARestoreArrayRead(bb, &barr));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarr));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1240 
/* Solve A^T x = b with the transposed triangular factors for a natural-ordering
   factorization: no row/column permutations are applied. The transposed factors
   are built lazily on the first transpose solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barr;
  PetscScalar                       *xarr;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Build the transpose analysis on the fly if it is not cached yet */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  /* Map both vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarr));
  PetscCall(VecCUDAGetArrayRead(bb, &barr));

  PetscCall(PetscLogGpuTimeBegin());
  /* Solve with the transposed upper factor: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, barr, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Solve with the transposed lower factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), xarr, loT->solvePolicy, loT->solveBuffer));

  /* Restore the vectors */
  PetscCall(VecCUDARestoreArrayRead(bb, &barr));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarr));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1278 
/* Solve A x = b with the cached lower/upper triangular factors, applying the
   row permutation to b before the solves and the column permutation to the
   result afterwards. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barr;
  PetscScalar                          *xarr;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                          *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Map both vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarr));
  PetscCall(VecCUDAGetArrayRead(bb, &barr));
  xGPU = thrust::device_pointer_cast(xarr);
  bGPU = thrust::device_pointer_cast(barr);

  PetscCall(PetscLogGpuTimeBegin());
  /* Apply the row permutation to the right-hand side, staging it in the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), work->begin());

  /* Lower solve: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, work->data().get(), xarr, lo->solvePolicy, lo->solveBuffer));

  /* Upper solve: x -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, xarr, work->data().get(), up->solvePolicy, up->solveBuffer));

  /* Apply the column permutation while gathering the result back into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(work->begin(), fs->cpermIndices->begin()), thrust::make_permutation_iterator(work->begin(), fs->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barr));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarr));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1318 
/* Solve A x = b with the cached triangular factors for a natural-ordering
   factorization: no permutations are needed, so the solves read b and write x
   directly, staging the intermediate result in the work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barr;
  PetscScalar                       *xarr;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Map both vectors onto the device */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarr));
  PetscCall(VecCUDAGetArrayRead(bb, &barr));

  PetscCall(PetscLogGpuTimeBegin());
  /* Lower solve: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, barr, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Upper solve: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), xarr, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barr));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarr));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1348 
1349 #if CUSPARSE_VERSION >= 11500
1350 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/* Solve A x = b with the ILU(0) factors through the generic cusparseSpSV API.
   The dense-vector descriptors X/Y are reused across solves: X is repointed at
   the current input/output array with cusparseDnVecSetValues(). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barr;
  PetscScalar                  *xarr;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarr));
  PetscCall(VecCUDAGetArrayRead(b, &barr));
  PetscCall(PetscLogGpuTimeBegin());

  /* Lower solve L y = b, with y held in fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barr));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  /* note: cusparseSpSV_solve() secretly reuses the external buffer given to cusparseSpSV_analysis() */
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Upper solve U x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barr));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarr));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1382 
/* Solve A^T x = b with the ILU(0) factors via cusparseSpSV using CUSPARSE_OPERATION_TRANSPOSE,
   i.e., solve U^T y = b followed by L^T x = y. The transpose SpSV descriptors/buffers are
   created once on the first call, and the (numeric) transpose analysis is redone whenever
   the factor values changed (the numeric factorization resets updatedTransposeSpSVAnalysis). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    /* Query buffer sizes before allocating; the buffers are kept for the lifetime of the factors */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Redo the analysis if the factor values changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b, with y held in fs->Y; X is repointed at the current input array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y, writing the result directly into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1431 
/* Numeric ILU(0) factorization on the GPU: copy A's values into fact's CSR value
   array, factorize in place with cusparseXcsrilu02(), then (re)do the numeric
   SpSV analysis for the L and U triangular solves. The symbolic phase must have
   set up fact's CSR structure, descriptors, and buffers beforehand. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the GPU */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* In debug builds, check for a numerical zero pivot produced during the factorization */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Install the ILU(0)-specific solve callbacks; MatSolve/MatSolveTranspose will use SpSV */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was precomputed in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1483 
1484 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1485 {
1486   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1487   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1488   PetscInt                      m, nz;
1489 
1490   PetscFunctionBegin;
1491   if (PetscDefined(USE_DEBUG)) {
1492     PetscInt  i;
1493     PetscBool flg, missing;
1494 
1495     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1496     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1497     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1498     PetscCall(MatMissingDiagonal(A, &missing, &i));
1499     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1500   }
1501 
1502   /* Free the old stale stuff */
1503   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1504 
1505   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1506      but they will not be used. Allocate them just for easy debugging.
1507    */
1508   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1509 
1510   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1511   fact->factortype             = MAT_FACTOR_ILU;
1512   fact->info.factor_mallocs    = 0;
1513   fact->info.fill_ratio_given  = info->fill;
1514   fact->info.fill_ratio_needed = 1.0;
1515 
1516   aij->row = NULL;
1517   aij->col = NULL;
1518 
1519   /* ====================================================================== */
1520   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1521   /* We'll do in-place factorization on fact                                */
1522   /* ====================================================================== */
1523   const int *Ai, *Aj;
1524 
1525   m  = fact->rmap->n;
1526   nz = aij->nz;
1527 
1528   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1529   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1530   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1531   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1532   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1533   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1534 
1535   /* ====================================================================== */
1536   /* Create descriptors for M, L, U                                         */
1537   /* ====================================================================== */
1538   cusparseFillMode_t fillMode;
1539   cusparseDiagType_t diagType;
1540 
1541   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1542   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1543   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1544 
1545   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1546     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1547     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1548     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1549     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1550   */
1551   fillMode = CUSPARSE_FILL_MODE_LOWER;
1552   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1553   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1554   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1555   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1556 
1557   fillMode = CUSPARSE_FILL_MODE_UPPER;
1558   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1559   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1560   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1561   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1562 
1563   /* ========================================================================= */
1564   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1565   /* ========================================================================= */
1566   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1567   if (m)
1568     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1569                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1570 
1571   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1572   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1573 
1574   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1575   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1576 
1577   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1578   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1579 
1580   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1581   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1582 
1583   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1584      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1585      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1586      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1587    */
1588   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1589     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1590     fs->spsvBuffer_L = fs->factBuffer_M;
1591     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1592   } else {
1593     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1594     fs->spsvBuffer_U = fs->factBuffer_M;
1595     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1596   }
1597 
1598   /* ========================================================================== */
1599   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1600   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1601   /* ========================================================================== */
1602   int              structural_zero;
1603   cusparseStatus_t status;
1604 
1605   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1606   if (m)
1607     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1608                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1609   if (PetscDefined(USE_DEBUG)) {
1610     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1611     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1612     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1613   }
1614 
1615   /* Estimate FLOPs of the numeric factorization */
1616   {
1617     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1618     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1619     PetscLogDouble flops = 0.0;
1620 
1621     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1622     Ai    = Aseq->i;
1623     Adiag = Aseq->diag;
1624     for (PetscInt i = 0; i < m; i++) {
1625       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1626         nzRow  = Ai[i + 1] - Ai[i];
1627         nzLeft = Adiag[i] - Ai[i];
1628         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1629           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1630         */
1631         nzLeft = (nzRow - 1) / 2;
1632         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1633       }
1634     }
1635     fs->numericFactFlops = flops;
1636   }
1637   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1638   PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640 
/* Triangular solve x = (L L^T)^{-1} b using the device ICC(0) factor held in fact->spptr.
   Two cusparseSpSV solves are chained: L y = b, then L^T x = y. Used for both solve and
   solvetranspose since the factorization is symmetric.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bv;
  PetscScalar                  *xv;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xv));
  PetscCall(VecCUDAGetArrayRead(b, &bv));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b: wrap b in dnVecDescr_X and the intermediate y in dnVecDescr_Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bv));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve L^T x = y: rebind dnVecDescr_X to x and solve with the transposed operator */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xv));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bv));
  PetscCall(VecCUDARestoreArrayWrite(x, &xv));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1671 
/* Numeric phase of the device ICC(0) factorization prepared by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0():
   copy A's values into fact's device CSR arrays, run cusparse csric02 in place, then redo the SpSV
   analyses for the L and L^T solves used by MatSolve_SeqAIJCUSPARSE_ICC0().
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  /* fact's csrRowPtr/csrColIdx were copied from A in the symbolic phase, so only values need copying */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* skipped for empty (m=0) matrices */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot queries are blocking (device sync), hence debug-only */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* The factor values changed, so the SpSV analysis must be redone before solving */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* The fresh factor lives only on the device; route the solves to the device ICC(0) kernels */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1724 
/* Symbolic phase of the device ICC(0) factorization. Since ICC(0) introduces no fill,
   fact reuses A's nonzero pattern; this routine allocates the device CSR and work buffers,
   creates the cusparse matrix/vector/SpSV descriptors, and performs the csric02 analysis so
   that MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() only has to redo the value-dependent steps.
   The IS argument is unused: this path is only taken with the natural (identity) ordering.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) adds no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* matDescr_M: legacy descriptor for the csric02 factorization of the full matrix M */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* spMatDescr_L: generic-API view of the lower triangle of the same device arrays, used for SpSV */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); /* skipped for empty (m=0) matrices */

  /* Work vectors backing the dense-vector descriptors used by the SpSV solves */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    /* factBuffer_M aliases the larger SpSV buffer; the two SpSV buffers themselves stay distinct */
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* skipped for empty (m=0) matrices */
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* Eliminating each nonzero left of the diagonal is assumed to update the entries from the
           eliminated one rightwards, each update costing one multiplication and one addition.
           Rows are assumed roughly symmetric about the diagonal, hence (nzRow - 1) / 2 on the left.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1866 #endif
1867 
/* Dispatch the symbolic ILU factorization: take the all-device ILU(0) fast path when cusparse is
   new enough and the conditions allow it, otherwise fall back to the host SeqAIJ symbolic phase.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  /* The device path requires zero fill levels and identity row/column permutations */
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) { /* device ILU(0) fast path */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Fallback: host symbolic factorization; the numeric phase is still routed to CUSPARSE */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1890 
/* Symbolic LU factorization: done on the host by the SeqAIJ implementation, with the
   numeric phase redirected to the CUSPARSE version.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Discard any stale device-side triangular factors before building new symbolic data */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  /* Hand the numeric stage to the GPU implementation */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1901 
/* Dispatch the symbolic ICC factorization: take the all-device ICC(0) fast path when cusparse is
   new enough and the conditions allow it, otherwise fall back to the host SeqAIJ symbolic phase.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  /* The device path requires zero fill levels and the identity permutation */
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) { /* device ICC(0) fast path */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* Fallback: host symbolic factorization; the numeric phase is still routed to CUSPARSE */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1921 
/* Symbolic Cholesky factorization: done on the host by the SeqAIJ implementation, with
   the numeric phase redirected to the CUSPARSE version.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Throw away any previously built device factors */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  /* Hand the numeric stage to the GPU implementation */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1932 
/* Report the MatSolverType implemented by this factor package (always MATSOLVERCUSPARSE).
   Composed on factor matrices under the name "MatFactorGetSolverType_C".
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1939 
1940 /*MC
1941   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
1942   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1944   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1945   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1946   algorithms are not recommended. This class does NOT support direct solver operations.
1947 
1948   Level: beginner
1949 
1950 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1951           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1952 M*/
1953 
/* Create the factor matrix B of type MATSEQAIJCUSPARSE for A and the requested factor type,
   wiring up the symbolic factorization routines (GPU or CPU depending on A's binding) and the
   preferred orderings. The -mat_factor_bind_factorization option selects host vs device
   factorization when both are possible.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  nrows = A->rmap->n;
  PetscBool onDevice, onHost;
  char     *optprefix;
  char      where[32] = "device"; /* default factorization location */

  PetscFunctionBegin;
  /* Create the (square) factor matrix with A's row dimension */
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, nrows, nrows, nrows, nrows));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Query where the user wants the factorization performed */
  optprefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), optprefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, where, sizeof(where), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("host", where, &onHost));
  PetscCall(PetscStrcasecmp("device", where, &onDevice));
  PetscCheck(onDevice || onHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", where);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = onDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    /* Use GPU symbolic routines unless A is pinned to the CPU */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2006 
/* Copy the matrix values from device back to host when the host copy is stale, covering both
   unfactored matrices (values live in the cusparse CsrMatrix) and, with cusparse >= 13500,
   factored ones (values live in the triangular-factors struct). Only values move; the
   nonzero pattern on the host is assumed current. Sets the offload mask to BOTH on success.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  /* NOTE: cusp and fs are two casts of the same A->spptr; which one is meaningful depends on
     whether A is factored, which is what the branches below test */
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2036 
/* Give read-write access to the host value array, syncing from the device first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Make sure the host copy of the values is current before handing out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2044 
/* End read-write access to the host value array; the device copy is now stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host values may have been modified, so mark the device copy invalid */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2052 
/* Give read-only access to the host value array, syncing from the device first. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Sync values to the host; read-only access leaves the offload mask untouched */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2060 
/* End read-only access to the host value array. Since the values were not modified,
   the offload mask is left unchanged; only the caller's pointer is cleared.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2067 
/* Give write-only access to the host value array; no device-to-host sync is needed
   because the current values will be overwritten anyway.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2074 
/* End write access to the host value array; the device copy is now stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* The host now holds the authoritative values */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2082 
/* Expose the device CSR arrays (row offsets i, column indices j, values a) of an unfactored
   MATSEQAIJCUSPARSE matrix, along with the memory type they live in (PETSC_MEMTYPE_CUDA).
   Any of i, j, a, mtype may be NULL if the caller does not need that output. Errors if the
   matrix is factored or was built with 64-bit PetscInt, since cusparse stores 32-bit indices.

   Fix: the two SETERRQ messages read "does not supported"; corrected to "does not support".
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2113 
2114 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2115 {
2116   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2117   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2118   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2119   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2120   cusparseStatus_t              stat;
2121   PetscBool                     both = PETSC_TRUE;
2122 
2123   PetscFunctionBegin;
2124   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2125   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2126     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2127       CsrMatrix *matrix;
2128       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2129 
2130       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2131       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2132       matrix->values->assign(a->a, a->a + a->nz);
2133       PetscCallCUDA(WaitForCUDA());
2134       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2135       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2136       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2137     } else {
2138       PetscInt nnz;
2139       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2140       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2141       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2142       delete cusparsestruct->workVector;
2143       delete cusparsestruct->rowoffsets_gpu;
2144       cusparsestruct->workVector     = NULL;
2145       cusparsestruct->rowoffsets_gpu = NULL;
2146       try {
2147         if (a->compressedrow.use) {
2148           m    = a->compressedrow.nrows;
2149           ii   = a->compressedrow.i;
2150           ridx = a->compressedrow.rindex;
2151         } else {
2152           m    = A->rmap->n;
2153           ii   = a->i;
2154           ridx = NULL;
2155         }
2156         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2157         if (!a->a) {
2158           nnz  = ii[m];
2159           both = PETSC_FALSE;
2160         } else nnz = a->nz;
2161         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2162 
2163         /* create cusparse matrix */
2164         cusparsestruct->nrows = m;
2165         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2166         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2167         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2168         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2169 
2170         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2171         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2172         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2173         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2174         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2175         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2176         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2177 
2178         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2179         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2180           /* set the matrix */
2181           CsrMatrix *mat   = new CsrMatrix;
2182           mat->num_rows    = m;
2183           mat->num_cols    = A->cmap->n;
2184           mat->num_entries = nnz;
2185           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2186           mat->row_offsets->assign(ii, ii + m + 1);
2187 
2188           mat->column_indices = new THRUSTINTARRAY32(nnz);
2189           mat->column_indices->assign(a->j, a->j + nnz);
2190 
2191           mat->values = new THRUSTARRAY(nnz);
2192           if (a->a) mat->values->assign(a->a, a->a + nnz);
2193 
2194           /* assign the pointer */
2195           matstruct->mat = mat;
2196 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2197           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2198             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2199                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2200             PetscCallCUSPARSE(stat);
2201           }
2202 #endif
2203         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2204 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2205           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2206 #else
2207           CsrMatrix *mat = new CsrMatrix;
2208           mat->num_rows = m;
2209           mat->num_cols = A->cmap->n;
2210           mat->num_entries = nnz;
2211           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2212           mat->row_offsets->assign(ii, ii + m + 1);
2213 
2214           mat->column_indices = new THRUSTINTARRAY32(nnz);
2215           mat->column_indices->assign(a->j, a->j + nnz);
2216 
2217           mat->values = new THRUSTARRAY(nnz);
2218           if (a->a) mat->values->assign(a->a, a->a + nnz);
2219 
2220           cusparseHybMat_t hybMat;
2221           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2222           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2223           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2224           PetscCallCUSPARSE(stat);
2225           /* assign the pointer */
2226           matstruct->mat = hybMat;
2227 
2228           if (mat) {
2229             if (mat->values) delete (THRUSTARRAY *)mat->values;
2230             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2231             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2232             delete (CsrMatrix *)mat;
2233           }
2234 #endif
2235         }
2236 
2237         /* assign the compressed row indices */
2238         if (a->compressedrow.use) {
2239           cusparsestruct->workVector = new THRUSTARRAY(m);
2240           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2241           matstruct->cprowIndices->assign(ridx, ridx + m);
2242           tmp = m;
2243         } else {
2244           cusparsestruct->workVector = NULL;
2245           matstruct->cprowIndices    = NULL;
2246           tmp                        = 0;
2247         }
2248         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2249 
2250         /* assign the pointer */
2251         cusparsestruct->mat = matstruct;
2252       } catch (char *ex) {
2253         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2254       }
2255       PetscCallCUDA(WaitForCUDA());
2256       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2257       cusparsestruct->nonzerostate = A->nonzerostate;
2258     }
2259     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2260   }
2261   PetscFunctionReturn(PETSC_SUCCESS);
2262 }
2263 
/* Thrust functor for zip iterators: accumulates the first tuple entry into the second, i.e. y[i] += x[i] */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2271 
/* Thrust functor for zip iterators: copies the first tuple entry into the second, i.e. y[i] = x[i] */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2279 
/* Thrust functor for zip iterators: copies the second tuple entry into the first, i.e. x[i] = y[i] */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2287 
/* Scratch data hung off C->product->data for cuSPARSE matrix-matrix products; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool      cisdense; /* caller's C was a host MATSEQDENSE; convert back after the GPU product */
  PetscScalar   *Bt;       /* device buffer for B^T (pre-CUDA-11 cusparseXcsrmm cannot transpose B) */
  Mat            X;        /* intermediate dense result for MATPRODUCT_PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count of the product, logged during the numeric phase */
  CsrMatrix     *Bcsr;     /* uncompressed CSR of B when B is stored in compressed-row form */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C (or for X in the PtAP/RARt case) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* device work buffers used by the CUDA >= 11.4 SpGEMMreuse path */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer, so it is only re-allocated when it must grow */
  void                 *mmBuffer;     /* SpMM/SpGEMM compute work buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2312 
/* Destructor for the MatMatCusparse scratch data attached to a Mat_Product; releases
   all device buffers, cuSPARSE descriptors and the intermediate dense matrix X. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt)); /* cudaFree(NULL) is a no-op, so no guard needed */
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors and buffers are created lazily, so each one is guarded */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2336 
2337 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2338 
/* Numeric phase of C = op(A)*op(B) with A sparse (MATSEQAIJCUSPARSE) and B dense; C is dense.
   For PtAP/RARt the sparse-dense product goes into the intermediate mmdata->X and the final
   projection is done with a dense-dense multiply at the end. Symbolic has already set up C,
   mmdata and (if needed) mmdata->X. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* pick the stored matrix (A or its explicit transpose), the cusparse op, and the m x n size of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes X = A*P here; the P^t*X projection happens at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* an explicitly stored transpose was requested: use it with a non-transpose op */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes X = A*R^t here; the R*X projection happens at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* for the projection products the sparse-dense result goes into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    /* NOTE(review): if only Clda changed, the cached matBDescr is kept and its data pointer is not
       refreshed with the current barray on this path — presumably the B array address is stable in
       that case; verify against the dense-matrix array lifetime */
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* only grow the work buffer, never shrink it */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated buffer mmdata->Bt via cublas geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  /* for the projection products, finish with a dense-dense multiply: C = B^(t)*X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo the temporary conversions done above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2496 
/* Symbolic phase for C = op(A)*op(B) with A MATSEQAIJCUSPARSE and B dense: sets the type and
   sizes of C, allocates the MatMatCusparse product data, and (for PtAP/RARt) creates the
   intermediate dense matrix X used by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *prod = C->product;
  Mat                 A, B;
  PetscInt            nr, nc;
  PetscBool           chostdense, match;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = prod->A;
  B = prod->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the resulting (dense) C for each supported product type */
  if (prod->type == MATPRODUCT_AB) {
    nr = A->rmap->n;
    nc = B->cmap->n;
  } else if (prod->type == MATPRODUCT_AtB) {
    nr = A->cmap->n;
    nc = B->cmap->n;
  } else if (prod->type == MATPRODUCT_ABt) {
    nr = A->rmap->n;
    nc = B->rmap->n;
  } else if (prod->type == MATPRODUCT_PtAP) {
    nr = B->cmap->n;
    nc = B->cmap->n;
  } else if (prod->type == MATPRODUCT_RARt) {
    nr = B->rmap->n;
    nc = B->rmap->n;
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[prod->type]);
  PetscCall(MatSetSizes(C, nr, nc, nr, nc));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &chostdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = chostdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (prod->type == MATPRODUCT_ABt || prod->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (prod->type == MATPRODUCT_RARt || prod->type == MATPRODUCT_PtAP) {
    const PetscInt xcols = (prod->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    PetscCall(MatSetSizes(mm->X, A->rmap->n, xcols, A->rmap->n, xcols));
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2567 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all MATSEQAIJCUSPARSE.
   The symbolic phase has already built C's CSR structure and (CUDA >= 11) the SpGEMM descriptors;
   this routine only recomputes the numerical values on the GPU. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: only the assembly bookkeeping below is needed */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* since spgemm cannot transpose, AtB/ABt were set up by symbolic using explicit transposes;
     when the matrix is symmetric, symbolic instead fell back to the plain AB kernel */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: reuse the SpGEMM structure computed by symbolic; only values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older CUDA 11: recompute with the work buffers saved by symbolic, then copy values into C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 legacy csrgemm path, writing directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2688 
2689 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2690 {
2691   Mat_Product                  *product = C->product;
2692   Mat                           A, B;
2693   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2694   Mat_SeqAIJ                   *a, *b, *c;
2695   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2696   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2697   PetscInt                      i, j, m, n, k;
2698   PetscBool                     flg;
2699   cusparseStatus_t              stat;
2700   MatProductType                ptype;
2701   MatMatCusparse               *mmdata;
2702   PetscLogDouble                flops;
2703   PetscBool                     biscompressed, ciscompressed;
2704 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2705   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2706   cusparseSpMatDescr_t BmatSpDescr;
2707 #else
2708   int cnz;
2709 #endif
2710   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2711 
2712   PetscFunctionBegin;
2713   MatCheckProduct(C, 1);
2714   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2715   A = product->A;
2716   B = product->B;
2717   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2718   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2719   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2720   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2721   a = (Mat_SeqAIJ *)A->data;
2722   b = (Mat_SeqAIJ *)B->data;
2723   /* product data */
2724   PetscCall(PetscNew(&mmdata));
2725   C->product->data    = mmdata;
2726   C->product->destroy = MatDestroy_MatMatCusparse;
2727 
2728   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2729   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2730   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2731   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2732   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2733   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2734 
2735   ptype = product->type;
2736   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2737     ptype                                          = MATPRODUCT_AB;
2738     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2739   }
2740   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2741     ptype                                          = MATPRODUCT_AB;
2742     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2743   }
2744   biscompressed = PETSC_FALSE;
2745   ciscompressed = PETSC_FALSE;
2746   switch (ptype) {
2747   case MATPRODUCT_AB:
2748     m    = A->rmap->n;
2749     n    = B->cmap->n;
2750     k    = A->cmap->n;
2751     Amat = Acusp->mat;
2752     Bmat = Bcusp->mat;
2753     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2754     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2755     break;
2756   case MATPRODUCT_AtB:
2757     m = A->cmap->n;
2758     n = B->cmap->n;
2759     k = A->rmap->n;
2760     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2761     Amat = Acusp->matTranspose;
2762     Bmat = Bcusp->mat;
2763     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2764     break;
2765   case MATPRODUCT_ABt:
2766     m = A->rmap->n;
2767     n = B->rmap->n;
2768     k = A->cmap->n;
2769     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2770     Amat = Acusp->mat;
2771     Bmat = Bcusp->matTranspose;
2772     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2773     break;
2774   default:
2775     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2776   }
2777 
2778   /* create cusparse matrix */
2779   PetscCall(MatSetSizes(C, m, n, m, n));
2780   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2781   c     = (Mat_SeqAIJ *)C->data;
2782   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2783   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2784   Ccsr  = new CsrMatrix;
2785 
2786   c->compressedrow.use = ciscompressed;
2787   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2788     c->compressedrow.nrows = a->compressedrow.nrows;
2789     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2790     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2791     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2792     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2793     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2794   } else {
2795     c->compressedrow.nrows  = 0;
2796     c->compressedrow.i      = NULL;
2797     c->compressedrow.rindex = NULL;
2798     Ccusp->workVector       = NULL;
2799     Cmat->cprowIndices      = NULL;
2800   }
2801   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2802   Ccusp->mat        = Cmat;
2803   Ccusp->mat->mat   = Ccsr;
2804   Ccsr->num_rows    = Ccusp->nrows;
2805   Ccsr->num_cols    = n;
2806   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2807   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2808   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2809   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2810   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
2811   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
2812   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2813   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2814   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2815   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2816   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2817     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2818     c->nz                = 0;
2819     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2820     Ccsr->values         = new THRUSTARRAY(c->nz);
2821     goto finalizesym;
2822   }
2823 
2824   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2825   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2826   Acsr = (CsrMatrix *)Amat->mat;
2827   if (!biscompressed) {
2828     Bcsr = (CsrMatrix *)Bmat->mat;
2829 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2830     BmatSpDescr = Bmat->matDescr;
2831 #endif
2832   } else { /* we need to use row offsets for the full matrix */
2833     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2834     Bcsr                 = new CsrMatrix;
2835     Bcsr->num_rows       = B->rmap->n;
2836     Bcsr->num_cols       = cBcsr->num_cols;
2837     Bcsr->num_entries    = cBcsr->num_entries;
2838     Bcsr->column_indices = cBcsr->column_indices;
2839     Bcsr->values         = cBcsr->values;
2840     if (!Bcusp->rowoffsets_gpu) {
2841       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2842       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2843       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2844     }
2845     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2846     mmdata->Bcsr      = Bcsr;
2847 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2848     if (Bcsr->num_rows && Bcsr->num_cols) {
2849       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2850       PetscCallCUSPARSE(stat);
2851     }
2852     BmatSpDescr = mmdata->matSpBDescr;
2853 #endif
2854   }
2855   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2856   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2857   /* precompute flops count */
2858   if (ptype == MATPRODUCT_AB) {
2859     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2860       const PetscInt st = a->i[i];
2861       const PetscInt en = a->i[i + 1];
2862       for (j = st; j < en; j++) {
2863         const PetscInt brow = a->j[j];
2864         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2865       }
2866     }
2867   } else if (ptype == MATPRODUCT_AtB) {
2868     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2869       const PetscInt anzi = a->i[i + 1] - a->i[i];
2870       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2871       flops += (2. * anzi) * bnzi;
2872     }
2873   } else { /* TODO */
2874     flops = 0.;
2875   }
2876 
2877   mmdata->flops = flops;
2878   PetscCall(PetscLogGpuTimeBegin());
2879 
2880 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2881   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2882   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2883   PetscCallCUSPARSE(stat);
2884   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2885   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2886   {
2887     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2888      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2889   */
2890     void *dBuffer1 = NULL;
2891     void *dBuffer2 = NULL;
2892     void *dBuffer3 = NULL;
2893     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2894     size_t bufferSize1 = 0;
2895     size_t bufferSize2 = 0;
2896     size_t bufferSize3 = 0;
2897     size_t bufferSize4 = 0;
2898     size_t bufferSize5 = 0;
2899 
2900     /* ask bufferSize1 bytes for external memory */
2901     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2902     PetscCallCUSPARSE(stat);
2903     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2904     /* inspect the matrices A and B to understand the memory requirement for the next step */
2905     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2906     PetscCallCUSPARSE(stat);
2907 
2908     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2909     PetscCallCUSPARSE(stat);
2910     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2911     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2912     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2913     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2914     PetscCallCUSPARSE(stat);
2915     PetscCallCUDA(cudaFree(dBuffer1));
2916     PetscCallCUDA(cudaFree(dBuffer2));
2917 
2918     /* get matrix C non-zero entries C_nnz1 */
2919     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2920     c->nz = (PetscInt)C_nnz1;
2921     /* allocate matrix C */
2922     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2923     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2924     Ccsr->values = new THRUSTARRAY(c->nz);
2925     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2926     /* update matC with the new pointers */
2927     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2928     PetscCallCUSPARSE(stat);
2929 
2930     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2931     PetscCallCUSPARSE(stat);
2932     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2933     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2934     PetscCallCUSPARSE(stat);
2935     PetscCallCUDA(cudaFree(dBuffer3));
2936     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2937     PetscCallCUSPARSE(stat);
2938     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2939   }
2940   #else
2941   size_t bufSize2;
2942   /* ask bufferSize bytes for external memory */
2943   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2944   PetscCallCUSPARSE(stat);
2945   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2946   /* inspect the matrices A and B to understand the memory requirement for the next step */
2947   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2948   PetscCallCUSPARSE(stat);
2949   /* ask bufferSize again bytes for external memory */
2950   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2951   PetscCallCUSPARSE(stat);
2952   /* The CUSPARSE documentation is not clear, nor the API
2953      We need both buffers to perform the operations properly!
2954      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2955      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2956      is stored in the descriptor! What a messy API... */
2957   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2958   /* compute the intermediate product of A * B */
2959   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2960   PetscCallCUSPARSE(stat);
2961   /* get matrix C non-zero entries C_nnz1 */
2962   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2963   c->nz = (PetscInt)C_nnz1;
2964   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2965                       mmdata->mmBufferSize / 1024));
2966   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2967   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2968   Ccsr->values = new THRUSTARRAY(c->nz);
2969   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2970   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2971   PetscCallCUSPARSE(stat);
2972   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2973   PetscCallCUSPARSE(stat);
2974   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2975 #else
2976   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2977   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2978                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2979   PetscCallCUSPARSE(stat);
2980   c->nz = cnz;
2981   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2982   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2983   Ccsr->values = new THRUSTARRAY(c->nz);
2984   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2985 
2986   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2987   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2988      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2989      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2990   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2991                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2992   PetscCallCUSPARSE(stat);
2993 #endif
2994   PetscCall(PetscLogGpuFlops(mmdata->flops));
2995   PetscCall(PetscLogGpuTimeEnd());
2996 finalizesym:
2997   c->singlemalloc = PETSC_FALSE;
2998   c->free_a       = PETSC_TRUE;
2999   c->free_ij      = PETSC_TRUE;
3000   PetscCall(PetscMalloc1(m + 1, &c->i));
3001   PetscCall(PetscMalloc1(c->nz, &c->j));
3002   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3003     PetscInt      *d_i = c->i;
3004     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3005     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3006     ii = *Ccsr->row_offsets;
3007     jj = *Ccsr->column_indices;
3008     if (ciscompressed) d_i = c->compressedrow.i;
3009     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3010     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3011   } else {
3012     PetscInt *d_i = c->i;
3013     if (ciscompressed) d_i = c->compressedrow.i;
3014     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3015     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3016   }
3017   if (ciscompressed) { /* need to expand host row offsets */
3018     PetscInt r = 0;
3019     c->i[0]    = 0;
3020     for (k = 0; k < c->compressedrow.nrows; k++) {
3021       const PetscInt next = c->compressedrow.rindex[k];
3022       const PetscInt old  = c->compressedrow.i[k];
3023       for (; r < next; r++) c->i[r + 1] = old;
3024     }
3025     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3026   }
3027   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3028   PetscCall(PetscMalloc1(m, &c->ilen));
3029   PetscCall(PetscMalloc1(m, &c->imax));
3030   c->maxnz         = c->nz;
3031   c->nonzerorowcnt = 0;
3032   c->rmax          = 0;
3033   for (k = 0; k < m; k++) {
3034     const PetscInt nn = c->i[k + 1] - c->i[k];
3035     c->ilen[k] = c->imax[k] = nn;
3036     c->nonzerorowcnt += (PetscInt) !!nn;
3037     c->rmax = PetscMax(c->rmax, nn);
3038   }
3039   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3040   PetscCall(PetscMalloc1(c->nz, &c->a));
3041   Ccsr->num_entries = c->nz;
3042 
3043   C->nonzerostate++;
3044   PetscCall(PetscLayoutSetUp(C->rmap));
3045   PetscCall(PetscLayoutSetUp(C->cmap));
3046   Ccusp->nonzerostate = C->nonzerostate;
3047   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3048   C->preallocated     = PETSC_TRUE;
3049   C->assembled        = PETSC_FALSE;
3050   C->was_assembled    = PETSC_FALSE;
3051   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3052     mmdata->reusesym = PETSC_TRUE;
3053     C->offloadmask   = PETSC_OFFLOAD_GPU;
3054   }
3055   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3056   PetscFunctionReturn(PETSC_SUCCESS);
3057 }
3058 
3059 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3060 
3061 /* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU backend when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *title = NULL, *bopt = NULL, *man = NULL;

    /* Map the product type to its options-block title, backend-selection option name and man page.
       The api_user flag selects between the legacy per-operation options (e.g. -matmatmult_backend_cpu)
       and the generic -mat_product_algorithm_backend_cpu. Types without a CPU-backend option
       (e.g. MATPRODUCT_ABt) leave title NULL and skip the query below. */
    switch (product->type) {
    case MATPRODUCT_AB:
      title = product->api_user ? "MatMatMult" : "MatProduct_AB";
      bopt  = product->api_user ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man   = "MatMatMult";
      break;
    case MATPRODUCT_AtB:
      title = product->api_user ? "MatTransposeMatMult" : "MatProduct_AtB";
      bopt  = product->api_user ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man   = "MatTransposeMatMult";
      break;
    case MATPRODUCT_PtAP:
      title = product->api_user ? "MatPtAP" : "MatProduct_PtAP";
      bopt  = product->api_user ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man   = "MatPtAP";
      break;
    case MATPRODUCT_RARt:
      title = product->api_user ? "MatRARt" : "MatProduct_RARt";
      bopt  = product->api_user ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man   = "MatRARt";
      break;
    case MATPRODUCT_ABC:
      title = product->api_user ? "MatMatMatMult" : "MatProduct_ABC";
      bopt  = product->api_user ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man   = "MatMatMatMult";
      break;
    default:
      break;
    }
    if (title) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(bopt, "Use CPU code", man, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized GPU kernel for these composite products; use the generic composition */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3178 
/* yy = A*xx: no transpose, no Hermitian, no add (the yy slot of the kernel is NULL) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3185 
/* zz = A*xx + yy: no transpose, no Hermitian */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3192 
/* yy = A^H*xx: trans && herm selects the conjugate transpose in the kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3199 
/* zz = A^H*xx + yy: trans && herm selects the conjugate transpose in the kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3206 
/* yy = A^T*xx: trans without herm selects the (non-conjugate) transpose in the kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3213 
/* y[idx[i]] += x[i] for 0 <= i < n: scatter-add the packed values x into y at positions idx.
   No atomics are used, so the idx entries must be distinct (callers pass the compressed-row
   indices, which are unique row numbers — TODO confirm for any new caller).
   Grid-stride loop: correct for any launch configuration, and the PetscInt index avoids the
   overflow the former 'int' index had when n exceeds INT_MAX with 64-bit PetscInt builds. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x * gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3219 
3220 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3221 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3222 {
3223   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3224   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3225   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3226   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3227   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3228   PetscBool                     compressed;
3229 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3230   PetscInt nx, ny;
3231 #endif
3232 
3233   PetscFunctionBegin;
3234   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3235   if (!a->nz) {
3236     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3237     else PetscCall(VecSeq_CUDA::Set(zz, 0));
3238     PetscFunctionReturn(PETSC_SUCCESS);
3239   }
3240   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3241   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3242   if (!trans) {
3243     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3244     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3245   } else {
3246     if (herm || !A->form_explicit_transpose) {
3247       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3248       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3249     } else {
3250       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3251       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3252     }
3253   }
3254   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3255   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3256 
3257   try {
3258     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3259     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3260     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3261 
3262     PetscCall(PetscLogGpuTimeBegin());
3263     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3264       /* z = A x + beta y.
3265          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3266          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3267       */
3268       xptr = xarray;
3269       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3270       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3271 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3272       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3273           allocated to accommodate different uses. So we get the length info directly from mat.
3274        */
3275       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3276         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3277         nx             = mat->num_cols;
3278         ny             = mat->num_rows;
3279       }
3280 #endif
3281     } else {
3282       /* z = A^T x + beta y
3283          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3284          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3285        */
3286       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3287       dptr = zarray;
3288       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3289       if (compressed) { /* Scatter x to work vector */
3290         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3291 
3292         thrust::for_each(
3293 #if PetscDefined(HAVE_THRUST_ASYNC)
3294           thrust::cuda::par.on(PetscDefaultCudaStream),
3295 #endif
3296           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3297           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3298       }
3299 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3300       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3301         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3302         nx             = mat->num_rows;
3303         ny             = mat->num_cols;
3304       }
3305 #endif
3306     }
3307 
3308     /* csr_spmv does y = alpha op(A) x + beta y */
3309     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3310 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3311       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3312       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3313         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3314         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3315         PetscCallCUSPARSE(
3316           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3317         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3318 
3319         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3320       } else {
3321         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3322         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3323         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3324       }
3325 
3326       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3327                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3328 #else
3329       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3330       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3331 #endif
3332     } else {
3333       if (cusparsestruct->nrows) {
3334 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3335         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3336 #else
3337         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3338         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3339 #endif
3340       }
3341     }
3342     PetscCall(PetscLogGpuTimeEnd());
3343 
3344     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3345       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3346         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3347           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3348         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3349           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3350         }
3351       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3352         PetscCall(VecSeq_CUDA::Set(zz, 0));
3353       }
3354 
3355       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3356       if (compressed) {
3357         PetscCall(PetscLogGpuTimeBegin());
3358         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3359            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3360            prevent that. So I just add a ScatterAdd kernel.
3361          */
3362 #if 0
3363         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3364         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3365                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3366                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3367                          VecCUDAPlusEquals());
3368 #else
3369         PetscInt n = matstruct->cprowIndices->size();
3370         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3371 #endif
3372         PetscCall(PetscLogGpuTimeEnd());
3373       }
3374     } else {
3375       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3376     }
3377     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3378     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3379     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3380   } catch (char *ex) {
3381     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3382   }
3383   if (yy) {
3384     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3385   } else {
3386     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3387   }
3388   PetscFunctionReturn(PETSC_SUCCESS);
3389 }
3390 
/* MatMultTransposeAdd: zz = A^T*xx + yy, delegated to the shared mult-add kernel.
   The PETSC_TRUE flag requests the transpose product; the final PETSC_FALSE flag
   presumably selects non-Hermitian handling -- confirm against the kernel's signature. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3397 
/* Finish assembly on the host, then drop any cached device-side matrix whose
   nonzero structure the assembly invalidated. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparse     = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscObjectState    state_before = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* a change in the nonzero state means the flattened device matrix no longer matches the host structure */
  if (cusparse->deviceMat && state_before != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparse->deviceMat));
    cusparse->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3412 
3413 /*@
3414    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3416    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3417    assembly performance the user should preallocate the matrix storage by setting
3418    the parameter nz (or the array nnz).  By setting these parameters accurately,
3419    performance during matrix assembly can be increased by more than a factor of 50.
3420 
3421    Collective
3422 
3423    Input Parameters:
3424 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3425 .  m - number of rows
3426 .  n - number of columns
3427 .  nz - number of nonzeros per row (same for all rows)
3428 -  nnz - array containing the number of nonzeros in the various rows
3429          (possibly different for each row) or `NULL`
3430 
3431    Output Parameter:
3432 .  A - the matrix
3433 
3434    Level: intermediate
3435 
3436    Notes:
3437    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3439    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3440 
3441    If `nnz` is given then `nz` is ignored
3442 
3443    The AIJ format, also called
3444    compressed row storage, is fully compatible with standard Fortran
3445    storage.  That is, the stored row and column indices can begin at
3446    either one (as in Fortran) or zero.  See the users' manual for details.
3447 
3448    Specify the preallocated storage with either nz or nnz (not both).
3449    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3450    allocation.  For large problems you MUST preallocate memory or you
3451    will get TERRIBLE performance, see the users' manual chapter on matrices.
3452 
3453    By default, this format uses inodes (identical nodes) when possible, to
3454    improve numerical efficiency of matrix-vector products and solves. We
3455    search for consecutive rows with the same nonzero structure, thereby
3456    reusing matrix information to achieve increased efficiency.
3457 
3458 .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3459 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* const is cast away to match the preallocation routine's signature; it presumably does not modify nnz -- confirm */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3469 
/* Destroy a MATSEQAIJCUSPARSE matrix: release the GPU-side data hanging off A->spptr,
   detach every method composed on the object, then run the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of all functions composed on the object during conversion to this type */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C",
                                  "MatCUSPARSESetFormat_C",
                                  "MatCUSPARSESetUseCPUSolve_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                  "MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C",
                                  "MatSetValuesCOO_C",
                                  "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* unfactored matrices keep a Mat_SeqAIJCUSPARSE in spptr; factored ones keep triangular-factor data */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3491 
3492 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3493 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by first duplicating as a plain SeqAIJ matrix, then converting the copy
   in place back to MATSEQAIJCUSPARSE (which re-installs the GPU method table). */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3501 
/*
  Y = Y + a*X on the GPU.

  Strategy: if the two matrices do not share the same axpy implementation (one is
  bound to the CPU), fall back to the host MatAXPY_SeqAIJ(). Otherwise, identical
  nonzero patterns take a single cuBLAS axpy on the value arrays, subset patterns
  take cusparse csrgeam, and anything else falls back to the host routine.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU binding: do the work on the host and drop Y's cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and byte-identical
     row offsets / column indices on the device promote str to SAME_NONZERO_PATTERN */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = 1.0*Y + a*X via csrgeam, with the result written back into Y's arrays in place */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b are read from host memory by the spgeam calls below */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11: spgeam needs an explicit workspace buffer sized by a separate query */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the pointer mode expected elsewhere in this file */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the update reduces to ay += a*ax on the raw value arrays */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* unrelated patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3587 
/* Scale every stored nonzero of Y by a, in place on the GPU, using a cuBLAS scal
   over the raw value array. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal information is stale after scaling */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3607 
/* Zero all stored values of A on both host and device (when device copies exist),
   keeping the offload mask consistent with where valid data now lives. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij    = (Mat_SeqAIJ *)A->data;
  PetscBool   on_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        on_gpu = PETSC_TRUE; /* device values zeroed too, so host and device remain in sync */
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csr = (CsrMatrix *)cusp->matTranspose->mat;

      if (csr->values) thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
    }
  }
  /* always zero the host copy */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = on_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3634 
/*
  Switch the matrix between CPU (flg = PETSC_TRUE) and GPU execution by swapping
  the operation tables and the composed methods. Factored matrices only record the
  flag; their solve paths consult it directly.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* bind to CPU: bring the host copy up to date, then install the plain SeqAIJ kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ-level ops table (a->ops, distinct from A->ops set above),
       which drops the GPU array accessors installed in the else branch */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* remove the GPU-only composed methods */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* bind to GPU: install the CUSPARSE kernels and GPU-aware array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* the inode optimization only applies to the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3698 
/*
  Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE: create/copy the target as needed,
  allocate the CUSPARSE bookkeeping in B->spptr, install the GPU operation tables,
  and compose the type-specific methods.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  /* for MAT_INPLACE_MATRIX, neither branch above runs and B aliases A */
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: per-matrix cusparse handle, stream, and algorithm choices */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: triangular-factor bookkeeping instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU kernel tables and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3758 
/* Constructor registered for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3766 
3767 /*MC
3768    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3769 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3771    CSR, ELL, or Hybrid format.
3772    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3773 
3774    Options Database Keys:
3775 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3776 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3777                                       Other options include ell (ellpack) or hyb (hybrid).
3778 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3779 -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3780 
3781   Level: beginner
3782 
3783 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3784 M*/
3785 
3786 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3787 
/* Register the cusparse solver packages: the banded LU factorization for SeqAIJ,
   and the LU/Cholesky/ILU/ICC factorizations for SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  for (size_t i = 0; i < sizeof(factors) / sizeof(factors[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3799 
/*
  Release all COO-assembly state held by the matrix: the thrust permutation vectors
  and, when the extended COO path was used, the device-side jmap/perm arrays.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees any non-NULL
       jmap_d/perm_d, so leaving them dangling here would cause a double free
       when the matrix is later destroyed */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3817 
/* Free every GPU resource held in a Mat_SeqAIJCUSPARSE struct, then the struct
   itself; *cusparsestruct may be NULL, in which case this is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    /* thrust vectors: delete handles NULL safely */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device arrays, allocated with cudaMalloc */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3836 
/* Delete a CsrMatrix (its three thrust vectors plus the struct) and NULL the
   caller's pointer; a NULL input is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3849 
/* Free a triangular-factor struct: its cusparse descriptors, CSR storage, solve
   and csr2csc work buffers, and the pinned host array; NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3866 
/* Free a mult struct: the stored matrix (CSR or, pre CUDA-11, HYB), descriptors,
   device-resident scalar constants, and the cached SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants live in device memory (cudaMalloc'ed) */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* release the per-operation SpMV descriptors/buffers built lazily during MatMult */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) { /* one slot per cusparseOperation_t value */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3908 
/* Free all triangular-factor data held by *trifactors (factor matrices, permutations, work
   vectors and, with cusparse >= 11.5, the SpSV descriptors and buffers), while keeping the
   container itself and its cusparse handle alive for reuse. A NULL *trifactors is a no-op. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* lower/upper factors and their transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* NOTE(review): the pointers freed below are not reset to NULL here; presumably callers
       rebuild them before the next use -- confirm against the factorization setup code */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3957 
/* Fully dispose of a Mat_SeqAIJCUSPARSETriFactors container: reset its contents,
   destroy the cusparse handle, and free the container itself. Sets *trifactors to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  /* nothing to do if the container was never created */
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3968 
/* Strict-weak lexicographic ordering on (i,j) pairs: first by row, then by column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return r1 < r2 || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3977 
/* Equality of (i,j) pairs: both row and column indices must match */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3985 
/* Binary op for adjacent_difference: 1 where consecutive indices differ, 0 where equal */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
3989 
/* Logical-or of two 0/1 flags, returned as a PetscInt (1 if either is nonzero) */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
3993 
3994 #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   Scatter/reduce the user-provided COO values v[] into the CSR value array on device, using
   the permutation (cooPerm) and optional duplicate map (cooPerm_a) built at preallocation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  /* no COO preallocation info: fall back to a plain final assembly */
  if (!cusp->cooPerm) {
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* v == NULL: INSERT_VALUES zeroes the matrix, ADD_VALUES leaves it unchanged */
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  /* make v[] accessible from device, copying it over if it lives on host */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them up before adding to the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4072 
/* Mark the cached transpose of A as out of date; when destroy is PETSC_TRUE the cached
   transpose structure and the csr2csc index map are freed outright. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    /* either way, any cached transpose can no longer be trusted */
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4088 
4089 #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices.
   Builds the CSR structure of A from n COO entries, recording in cusp->cooPerm the sort
   permutation and (when duplicates exist) in cusp->cooPerm_a the COO-entry -> unique-nonzero
   map, both used later by MatSetValuesCOO_SeqAIJCUSPARSE_Basic(). Note the index arrays are
   sorted in place when they already reside on device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  /* a different number of COO entries invalidates any cached permutation data */
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* make coo_i[] accessible on device, copying it over if it lives on host */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* build the host-side CSR structure of A from the de-duplicated device data */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    /* per-row lengths, maximum row length and nonzero-row count for the SeqAIJ bookkeeping */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4225 
/* COO preallocation entry point: dispatch to the 'basic' device path when the indices are
   device-resident or contain no negative entries, otherwise fall back to the extended host
   path and mirror its jmap/perm arrays on device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* discard any previous COO setup before building a new one */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* only host indices are scanned for negative entries; device indices are assumed basic */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* extended path: the host SeqAIJ implementation builds jmap/perm, which we copy to device
       for use by the MatAddCOOValues kernel in MatSetValuesCOO_SeqAIJCUSPARSE() */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4264 
/* Grid-stride kernel: for each matrix nonzero i, sum the COO inputs kv[perm[jmap[i]..jmap[i+1])]
   and either overwrite (INSERT_VALUES) or accumulate into a[i]. 1D launch, any grid size. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar v = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) v += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + v;
  }
}
4275 
/* Set/add COO values into A on device. Uses the extended jmap/perm machinery when it was set
   up by MatSetPreallocationCOO_SeqAIJCUSPARSE(), otherwise the 'basic' thrust-based path. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites every nonzero, so write-only access suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      /* 256 threads per block; grid-stride kernel covers all Annz nonzeros */
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4310 
4311 /*@C
4312     MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4313 
4314    Not Collective
4315 
4316     Input Parameters:
4317 +   A - the matrix
4318 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should always be returned in compressed form
4319 
4320     Output Parameters:
4321 +   ia - the CSR row pointers
4322 -   ja - the CSR column indices
4323 
4324     Level: developer
4325 
4326     Note:
4327       When compressed is true, the CSR structure does not contain empty rows
4328 
4329 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4330 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both pointers must be requested; otherwise nothing to do */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* make sure the device CSR is current before handing out pointers into it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* lazily build (and cache) uncompressed row offsets on device from the host a->i[] */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4358 
4359 /*@C
4360     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4361 
4362    Not Collective
4363 
4364     Input Parameters:
4365 +   A - the matrix
4366 .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should always be returned in compressed form
4367 .   ia - the CSR row pointers
4368 -   ja - the CSR column indices
4369 
4370     Level: developer
4371 
4372 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4373 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the pointers were borrowed from the device CSR; just invalidate the caller's copies */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4383 
4384 /*@C
4385    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4386 
4387    Not Collective
4388 
4389    Input Parameter:
4390 .   A - a `MATSEQAIJCUSPARSE` matrix
4391 
4392    Output Parameter:
4393 .   a - pointer to the device data
4394 
4395    Level: developer
4396 
4397    Note:
4398    May trigger host-device copies if up-to-date matrix data is on host
4399 
4400 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4401 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* ensure the device copy is current before handing out a pointer into it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offload mask and cached transpose remain valid */
  *a = matrix->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4419 
4420 /*@C
4421    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4422 
4423    Not Collective
4424 
4425    Input Parameters:
4426 +   A - a `MATSEQAIJCUSPARSE` matrix
4427 -   a - pointer to the device data
4428 
4429    Level: developer
4430 
4431 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4432 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read access changed nothing, so only the caller's pointer needs invalidating */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4442 
4443 /*@C
4444    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4445 
4446    Not Collective
4447 
4448    Input Parameter:
4449 .   A - a `MATSEQAIJCUSPARSE` matrix
4450 
4451    Output Parameter:
4452 .   a - pointer to the device data
4453 
4454    Level: developer
4455 
4456    Note:
4457    May trigger host-device copies if up-to-date matrix data is on host
4458 
4459 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4460 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the device copy up to date before exposing it for read-write access */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify values: the device copy becomes authoritative and any cached transpose goes stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4480 /*@C
4481    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4482 
4483    Not Collective
4484 
4485    Input Parameters:
4486 +   A - a `MATSEQAIJCUSPARSE` matrix
4487 -   a - pointer to the device data
4488 
4489    Level: developer
4490 
4491 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4492 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4504 
4505 /*@C
4506    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4507 
4508    Not Collective
4509 
4510    Input Parameter:
4511 .   A - a `MATSEQAIJCUSPARSE` matrix
4512 
4513    Output Parameter:
4514 .   a - pointer to the device data
4515 
4516    Level: developer
4517 
4518    Note:
4519    Does not trigger host-device copies and flags data validity on the GPU
4520 
4521 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4522 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no host-to-device copy is performed, existing values will be overwritten */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the device copy becomes authoritative and any cached transpose goes stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4541 
4542 /*@C
4543    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4544 
4545    Not Collective
4546 
4547    Input Parameters:
4548 +   A - a `MATSEQAIJCUSPARSE` matrix
4549 -   a - pointer to the device data
4550 
4551    Level: developer
4552 
4553 .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4554 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4566 
/* Strict-weak lexicographic ordering on the first two (row, col) components of a 4-tuple */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    return r1 < r2 || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
4575 
/* Unary functor adding a fixed offset to an index (used to shift column indices) */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4582 
/*
  MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows;
  the [A';B']' operation in matlab notation, i.e. C = [A B] with B's column indices shifted by
  A->cmap->n. Both matrices must have the same number of rows and use CSR storage (ELL/HYB not
  supported). With MAT_INITIAL_MATRIX the structure and values of C are built on the GPU; with
  MAT_REUSE_MATRIX only the numerical values of a previously merged C are refreshed, using the
  permutation (cooPerm) recorded at creation time. MAT_INPLACE_MATRIX is not supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    /* build the cusparse mult structure of C by hand (no compressed-row storage) */
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV/SpMM calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets to COO row indices so A's and B's entries can be merged as (i,j,v) triples */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the two sorted COO streams by (row, col); the fourth tuple slot (1 for A, 0 for B)
         lands in wPerm and records, per entry of C, which input matrix it came from */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* cooPerm[0..Annz) = positions in C of A's entries, cooPerm[Annz..) = positions of B's entries;
         the MAT_REUSE_MATRIX path uses this permutation to scatter updated values without re-merging */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress the merged COO row indices back to CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C' = [A'; B'] stacked vertically: A' occupies the first A->cmap->n rows, B' the rest,
           with B''s row offsets shifted by a->nz since its entries follow A''s */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's structure (i, j, ilen, imax, row statistics) on the host for the Mat_SeqAIJ API */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: C's pattern is fixed; only re-scatter the (possibly updated) values of A and B */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through the first Acsr->num_entries slots of cooPerm,
         then B's values through the remaining slots */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4864 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copy a subset of the matrix's nonzero values into v.

  If idx is provided, v[k] = a[idx[k]] for k = 0..n-1 (gathered on the GPU via a permutation
  iterator); otherwise the first n values are copied directly. v may be either host or device
  memory; isCudaMem() decides which, and the copy path is chosen accordingly.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* gather destination: v itself when it is device memory, otherwise a device scratch buffer */
    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] performed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved device->host, so log it as GpuToCpu (was
     erroneously logged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4900