xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2d776b4963042cdf8a412ba09e923aa51facd799) !
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #if PETSC_CPP_VERSION >= 14
17   #define PETSC_HAVE_THRUST_ASYNC 1
18   // thrust::for_each(thrust::cuda::par.on()) requires C++14
19   #include <thrust/async/for_each.h>
20 #endif
21 #include <thrust/iterator/constant_iterator.h>
22 #include <thrust/remove.h>
23 #include <thrust/sort.h>
24 #include <thrust/unique.h>
25 
/* Printable names for MatCUSPARSEStorageFormat values, in 0-based enum-value order,
   followed by the enum type name and option prefix used when parsing command line
   options with PetscOptionsEnum() (see MatSetFromOptions_SeqAIJCUSPARSE() below) */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
/* Note: the string positions below must match the cuSPARSE enum values above; this is
   verified at runtime by the PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
62 
/* Forward declarations: symbolic/numeric factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

/* Forward declarations: triangular solves, matrix-vector products, and other Mat operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Forward declarations: destructors for the internal GPU data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

/* Forward declarations: host/device data movement */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* Forward declarations: sub-array copy and COO-based assembly */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* A sequential AIJ matrix carries a single GPU storage structure, so the
     MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL operations set the same format field */
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
116 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats.

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
        `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatches to MatCUSPARSESetFormat_SeqAIJCUSPARSE() when the method is registered; no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
142 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  /* record the preference; it is consulted in MatLUFactorNumeric_SeqAIJCUSPARSE() and
     MatSetFromOptions_SeqAIJCUSPARSE() to decide whether the solve stays on the CPU */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
151 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether to use the built-in CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatches to MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE() when the method is registered */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
177 
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    /* every other option is handled by the base sequential AIJ implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
193 
/* defined later in this file; declared here because MatLUFactorNumeric_SeqAIJCUSPARSE() calls it */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
195 
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *fact = (Mat_SeqAIJ *)B->data;
  IS                  rowperm = fact->row, colperm = fact->col;
  PetscBool           rowid, colid;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* the numeric factorization runs on the host, so bring A's values back from the device first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: the natural-ordering kernels
     apply when both the row and column permutations are the identity */
  PetscCall(ISIdentity(rowperm, &rowid));
  PetscCall(ISIdentity(colperm, &colid));

  if (!cusp->use_cpu_solve) {
    if (rowid && colid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU when the solve runs there */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
227 
/*
   Processes the -mat_cusparse_* runtime options: GPU storage format (for SpMV only, or for
   all operations), CPU vs GPU triangular solve, and -- on CUDA >= 11 -- the cuSPARSE
   SpMV/SpMM/csr2csc algorithm choices. Factored matrices (A->factortype != MAT_FACTOR_NONE)
   take no options here.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
263 
/*
   Builds (or, on later calls, updates the values of) the GPU copy of the unit lower
   triangular factor L from the host (I)LU factorization stored in A, and performs the
   cusparseXcsrsv_analysis() needed before triangular solves. Runs only when the host
   data is newer than the device data (offloadmask UNALLOCATED or CPU).

   NOTE(review): this assumes a->i/a->j/a->a hold the L part of the factor without its
   unit diagonal, which is re-inserted explicitly below -- confirm against
   MatLUFactorNumeric_SeqAIJ's factor storage.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly lower entries of rows 1..n-1 (row 0 has none) */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first build: create index/value arrays and the cuSPARSE structures */
        PetscScalar *AALo;

        /* pinned host memory so the thrust assign() transfers below are fast */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: each row is its strictly-lower entries followed by an explicit unit diagonal */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        /* unit diagonal: the 1.0 entries stored above are not read by the solve */
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* CUDA >= 9 csrsv2 interface requires a user-provided work buffer for the analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer (AA_h) for fast value-only updates; the index buffers are no longer needed */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only: sparsity pattern is unchanged, so reuse the existing indices and analysis */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* the thrust/cusparse wrappers used here report failures as char* exceptions */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
399 
/*
   Builds (or, on later calls, updates the values of) the GPU copy of the upper
   triangular factor U from the host (I)LU factorization stored in A, and performs the
   cusparseXcsrsv_analysis() needed before triangular solves. Runs only when the host
   data is newer than the device data (offloadmask UNALLOCATED or CPU).

   NOTE(review): the loop walks rows n-1..0 using a->diag, which appears to index the
   U factor stored in reverse row order between adiag[n] and adiag[0], with the stored
   diagonal entry at v[nz] -- confirm against MatLUFactorNumeric_SeqAIJ's factor storage.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first build: create index/value arrays and the cuSPARSE structures */
        PetscScalar *AAUp;

        /* pinned host memory so the thrust assign() transfers below are fast */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first and filling
           the CSR arrays back-to-front so they come out in ascending row order */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        /* non-unit diagonal, unlike the L factor */
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* CUDA >= 9 csrsv2 interface requires a user-provided work buffer for the analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer (AA_h) for fast value-only updates; the index buffers are no longer needed */
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only: sparsity pattern is unchanged, so reuse the existing indices and analysis */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* the thrust/cusparse wrappers used here report failures as char* exceptions */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
534 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis = aij->row, colis = aij->icol;
  PetscBool                     rowid, colid;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* build (or refresh) the device copies of the L and U factors and their solve analysis */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector shared by the two triangular solves */
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(n);
  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the row permutation to the device, unless it is the identity (or already uploaded) */
  PetscCall(ISIdentity(rowis, &rowid));
  if (!rowid && !fs->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(rowis, &r));
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(rowis, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* likewise for the column permutation */
  PetscCall(ISIdentity(colis, &colid));
  if (!colid && !fs->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(colis, &c));
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(colis, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
577 
/* Build the GPU triangular-factor structures for an ICC factorization.

   The factor matrix A holds its data in SBAIJ (upper-triangular) layout.  This routine
   assembles, on the host, a CSR copy of the upper factor (AiUp/AjUp/AAUp) and the scaled
   values of its transpose (AALo), then uploads them into the cuSPARSE triangular-factor
   structures hanging off A->spptr.  On the first call (no factor structs yet) the index
   arrays are built and the cuSPARSE solve analysis is performed for both factors; on a
   refactorization (structs already exist) only the numerical values are recomputed and
   re-uploaded, since the sparsity pattern is unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* host (pinned) CSR row offsets and column indices of the upper factor */
  PetscScalar                       *AAUp;        /* host (pinned) values of the upper factor */
  PetscScalar                       *AALo;        /* host (pinned) values used for the lower (transposed) solve */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* NOTE(review): aliases 'a'; the factor uses SBAIJ-style i/j/a arrays */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date values live on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers so the thrust assign() uploads below are fast */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: v[nz] is the last stored entry of row i,
             taken here as the diagonal; its inverse goes into both factors */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* copy the off-diagonal column indices/values, then negate for the upper
               factor and additionally scale by the inverse diagonal for the lower one */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation: the lower solve reuses the upper factor's sparsity by
           solving with its transpose, so no separate lower CSR pattern is stored */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix (same pattern as the upper factor, values from AALo) */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* refactorization path: the sparsity pattern (and the GPU index arrays) are
           unchanged, so recompute only the numerical values and re-upload them */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
782 
/* Push the ICC factors of A onto the GPU and set up the auxiliary data the GPU solve
   needs: a work vector, the factored nonzero count, and — when the factorization used a
   non-identity row permutation — device copies of the permutation and its inverse. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscBool                     identity;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* nnz of the full (symmetric) factor: off-diagonals counted twice plus the diagonal */
  factors->nnz = (aij->nz - nrows) * 2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* mirror the row permutation (and its inverse) on the device when it is not the identity */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *inv_idx, *idx;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &inv_idx));
    PetscCall(ISGetIndices(rowperm, &idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx + nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv_idx, inv_idx + nrows);
    PetscCall(ISRestoreIndices(invperm, &inv_idx));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm, &idx));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
819 
/* Numeric Cholesky factorization: compute the factor on the CPU with the SeqAIJ kernel,
   then install the appropriate GPU solve routines and copy the factors to the device.
   The NaturalOrdering solve variants are used when the factorization permutation is the
   identity, avoiding the permutation scatter/gather in each solve. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *bdata   = (Mat_SeqAIJ *)B->data;
  IS          rowperm = bdata->row;
  PetscBool   identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(rowperm, &identity));
  B->ops->solve             = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
848 
/* Build cached transposes (CSC forms) of the lower and upper triangular factors so that
   transpose solves can run as plain (non-transpose) triangular solves on the GPU.  The
   transposed factors are stored in {lo,up}TriFactorPtrTranspose of A->spptr.  Assumes the
   non-transpose factors have already been created on the GPU.

   Fixes relative to the previous revision: the MAT_CUSPARSEGenerateTranspose profiling
   event was "begun" twice and never ended (PetscLogEventBegin where PetscLogEventEnd was
   required), which unbalances PETSc's event-nesting log. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the matrix descriptors of the lower triangular factor, flipping the fill mode:
     the transpose of an upper-filled matrix is lower-filled and vice versa */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the matrix descriptors of the upper triangular factor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1012 
/* Unary functor for thrust transforms: truncate the real part of a PetscScalar to a PetscInt. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar val) { return static_cast<PetscInt>(PetscRealPart(val)); }
};
1016 
1017 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1018 {
1019   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1020   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1021   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1022   cusparseStatus_t              stat;
1023   cusparseIndexBase_t           indexBase;
1024 
1025   PetscFunctionBegin;
1026   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1027   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1028   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1029   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1030   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1031   if (A->transupdated) PetscFunctionReturn(0);
1032   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1033   PetscCall(PetscLogGpuTimeBegin());
1034   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1035   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1036     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1037     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1038     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1039     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1040     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1041 
1042     /* set alpha and beta */
1043     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1044     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1045     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1046     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1047     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1048     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1049 
1050     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1051       CsrMatrix *matrixT      = new CsrMatrix;
1052       matstructT->mat         = matrixT;
1053       matrixT->num_rows       = A->cmap->n;
1054       matrixT->num_cols       = A->rmap->n;
1055       matrixT->num_entries    = a->nz;
1056       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1057       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1058       matrixT->values         = new THRUSTARRAY(a->nz);
1059 
1060       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1061       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1062 
1063 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1064   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1065       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1066                                indexBase, cusparse_scalartype);
1067       PetscCallCUSPARSE(stat);
1068   #else
1069       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1070            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1071 
1072            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1073            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1074            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1075         */
1076       if (matrixT->num_entries) {
1077         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1078         PetscCallCUSPARSE(stat);
1079 
1080       } else {
1081         matstructT->matDescr = NULL;
1082         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1083       }
1084   #endif
1085 #endif
1086     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1087 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1088       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1089 #else
1090       CsrMatrix *temp = new CsrMatrix;
1091       CsrMatrix *tempT = new CsrMatrix;
1092       /* First convert HYB to CSR */
1093       temp->num_rows = A->rmap->n;
1094       temp->num_cols = A->cmap->n;
1095       temp->num_entries = a->nz;
1096       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1097       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1098       temp->values = new THRUSTARRAY(a->nz);
1099 
1100       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1101       PetscCallCUSPARSE(stat);
1102 
1103       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1104       tempT->num_rows = A->rmap->n;
1105       tempT->num_cols = A->cmap->n;
1106       tempT->num_entries = a->nz;
1107       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1108       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1109       tempT->values = new THRUSTARRAY(a->nz);
1110 
1111       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1112                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1113       PetscCallCUSPARSE(stat);
1114 
1115       /* Last, convert CSC to HYB */
1116       cusparseHybMat_t hybMat;
1117       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1118       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1119       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1120       PetscCallCUSPARSE(stat);
1121 
1122       /* assign the pointer */
1123       matstructT->mat = hybMat;
1124       A->transupdated = PETSC_TRUE;
1125       /* delete temporaries */
1126       if (tempT) {
1127         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1128         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1129         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1130         delete (CsrMatrix *)tempT;
1131       }
1132       if (temp) {
1133         if (temp->values) delete (THRUSTARRAY *)temp->values;
1134         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1135         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1136         delete (CsrMatrix *)temp;
1137       }
1138 #endif
1139     }
1140   }
1141   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1142     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1143     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1144     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1145     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1146     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1147     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1148     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1149     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1150     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1151     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1152     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1153       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1154       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1155       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1156     }
1157     if (!cusparsestruct->csr2csc_i) {
1158       THRUSTARRAY csr2csc_a(matrix->num_entries);
1159       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1160 
1161       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1162 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1163       void  *csr2cscBuffer;
1164       size_t csr2cscBufferSize;
1165       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1166                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1167       PetscCallCUSPARSE(stat);
1168       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1169 #endif
1170 
1171       if (matrix->num_entries) {
1172         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1173            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1174            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1175 
1176            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1177            should be filled with indexBase. So I just take a shortcut here.
1178         */
1179         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1180 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1181                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1182         PetscCallCUSPARSE(stat);
1183 #else
1184                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1185         PetscCallCUSPARSE(stat);
1186 #endif
1187       } else {
1188         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1189       }
1190 
1191       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1192       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1193 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1194       PetscCallCUDA(cudaFree(csr2cscBuffer));
1195 #endif
1196     }
1197     PetscCallThrust(
1198       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1199   }
1200   PetscCall(PetscLogGpuTimeEnd());
1201   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1202   /* the compressed row indices is not used for matTranspose */
1203   matstructT->cprowIndices = NULL;
1204   /* assign the pointer */
1205   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1206   A->transupdated                                = PETSC_TRUE;
1207   PetscFunctionReturn(0);
1208 }
1209 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the triangular factors of A with row/column permutations.
   Since A = L*U, A^T = U^T*L^T, so the (transposed) upper factor is solved first,
   then the (transposed) lower factor. The transposed factor data is built lazily,
   on the first transpose solve, and cached in cusparseTriFactors. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only once; the result is cached in cusparseTriFactors) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation; x is used as scratch for the permuted right-hand side */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the (transposed) upper factor: input x, output the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the (transposed) lower factor: input the work vector, output x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1284 
/* Transpose solve A^T x = b in the natural-ordering case: same as MatSolveTranspose_SeqAIJCUSPARSE,
   but with no row/column permutations, so b and x can be passed to the triangular solves directly
   (solve the transposed upper factor into the work vector, then the transposed lower factor into x). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only once; the result is cached in cusparseTriFactors) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the (transposed) upper factor: input b, output the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the (transposed) lower factor: input the work vector, output x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1344 
/* Solve A x = b with the cached triangular factors and row/column permutations:
   permute b by the row permutation into the work vector, forward-solve L, back-solve U,
   then permute by the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: input the work vector, output x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U: input x, output the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation, writing the final solution into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1407 
/* Solve A x = b in the natural-ordering case: same as MatSolve_SeqAIJCUSPARSE but with no
   row/column permutations, so b feeds the L solve directly and the U solve writes into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: input b, output the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U: input the work vector, output x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1459 
1460 #if CUSPARSE_VERSION >= 11500
1461 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/* Solve A x = b with the ILU(0) factors via the generic SpSV API: forward solve L y = b into
   the workspace fs->Y, then backward solve U x = y. Flops logged as 2*nz - n. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b, with y landing in the workspace fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly reuses the external buffer handed to cusparseSpSV_analysis()!

  /* Backward solve U x = y; reuse the X descriptor, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1493 
/* Solve A^T x = b with the ILU(0) factors: since A = L*U, A^T = U^T*L^T, so solve U^T y = b
   into the workspace fs->Y, then L^T x = y. The transpose-solve SpSV descriptors/buffers are
   created on first use, and their numeric analysis is redone whenever the factor values change. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  /* Lazily create descriptors and work buffers for the transpose solves on the first call */
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L; we only do transpose solves with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Redo the (numeric) analysis if the factor values changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve U^T y = b, with y landing in the workspace fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve L^T x = y; reuse the X descriptor, now pointing at the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1542 
/* Numeric ILU(0) factorization on the GPU.

   Copies A's device CSR values into fact's in-place storage, runs cusparseXcsrilu02() to
   factor in place, then performs the numeric SpSV analysis required by the subsequent L/U
   solves, and installs the SpSV-based solve routines on fact. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* In debug mode, check whether the factorization hit a numerically zero pivot */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1594 
1595 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1596 {
1597   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1598   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1599   PetscInt                      m, nz;
1600 
1601   PetscFunctionBegin;
1602   if (PetscDefined(USE_DEBUG)) {
1603     PetscInt  i;
1604     PetscBool flg, missing;
1605 
1606     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1607     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1608     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1609     PetscCall(MatMissingDiagonal(A, &missing, &i));
1610     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1611   }
1612 
1613   /* Free the old stale stuff */
1614   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1615 
1616   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1617      but they will not be used. Allocate them just for easy debugging.
1618    */
1619   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1620 
1621   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1622   fact->factortype             = MAT_FACTOR_ILU;
1623   fact->info.factor_mallocs    = 0;
1624   fact->info.fill_ratio_given  = info->fill;
1625   fact->info.fill_ratio_needed = 1.0;
1626 
1627   aij->row = NULL;
1628   aij->col = NULL;
1629 
1630   /* ====================================================================== */
1631   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1632   /* We'll do in-place factorization on fact                                */
1633   /* ====================================================================== */
1634   const int *Ai, *Aj;
1635 
1636   m  = fact->rmap->n;
1637   nz = aij->nz;
1638 
1639   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1640   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1641   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1642   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1643   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1644   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1645 
1646   /* ====================================================================== */
1647   /* Create descriptors for M, L, U                                         */
1648   /* ====================================================================== */
1649   cusparseFillMode_t fillMode;
1650   cusparseDiagType_t diagType;
1651 
1652   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1653   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1654   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1655 
1656   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1657     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1658     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1659     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1660     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1661   */
1662   fillMode = CUSPARSE_FILL_MODE_LOWER;
1663   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1664   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1665   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1666   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1667 
1668   fillMode = CUSPARSE_FILL_MODE_UPPER;
1669   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1670   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1671   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1672   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1673 
1674   /* ========================================================================= */
1675   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1676   /* ========================================================================= */
1677   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1678   if (m)
1679     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1680                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1681 
1682   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1683   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1684 
1685   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1686   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1687 
1688   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1689   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1690 
1691   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1692   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1693 
1694   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1695      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1696      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1697      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1698    */
1699   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1700     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1701     fs->spsvBuffer_L = fs->factBuffer_M;
1702     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1703   } else {
1704     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1705     fs->spsvBuffer_U = fs->factBuffer_M;
1706     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1707   }
1708 
1709   /* ========================================================================== */
1710   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1711   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1712   /* ========================================================================== */
1713   int              structural_zero;
1714   cusparseStatus_t status;
1715 
1716   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1717   if (m)
1718     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1719                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1720   if (PetscDefined(USE_DEBUG)) {
1721     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1722     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1723     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1724   }
1725 
1726   /* Estimate FLOPs of the numeric factorization */
1727   {
1728     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1729     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1730     PetscLogDouble flops = 0.0;
1731 
1732     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1733     Ai    = Aseq->i;
1734     Adiag = Aseq->diag;
1735     for (PetscInt i = 0; i < m; i++) {
1736       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1737         nzRow  = Ai[i + 1] - Ai[i];
1738         nzLeft = Adiag[i] - Ai[i];
1739         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1740           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1741         */
1742         nzLeft = (nzRow - 1) / 2;
1743         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1744       }
1745     }
1746     fs->numericFactFlops = flops;
1747   }
1748   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1749   PetscFunctionReturn(0);
1750 }
1751 
/*
  Triangular solve with the IC(0) factor computed by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0().

  Solves A x = b with A ~ L*L^T by a forward solve L y = b followed by a backward
  solve L^T x = y, both done on the GPU with cusparseSpSV_solve(). The dense-vector
  descriptors dnVecDescr_X/Y created at symbolic time are re-pointed at the Vec
  arrays with cusparseDnVecSetValues() so no extra device buffers are needed.

  Input Parameters:
.   fact - the factored matrix (holds the Mat_SeqAIJCUSPARSETriFactors in spptr)
.   b    - right-hand side vector

  Output Parameter:
.   x    - solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b: point X at b's device array and Y at the internal work vector fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y: reuse descriptor X for the output x; note this uses the transpose
     analysis stored in spsvDescr_Lt (L itself is the only matrix descriptor) */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: roughly one multiply-add per stored nonzero of L and of L^T */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1782 
/*
  Numeric IC(0) factorization on the GPU with cusparseXcsric02().

  Copies A's current device values into the factor's value array (the sparsity
  pattern was fixed at symbolic time), factorizes in place, then runs the SpSV
  analysis for the L and L^T solves used by MatSolve_SeqAIJCUSPARSE_ICC0().

  Input Parameters:
.   fact - the factor matrix produced by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0()
.   A    - the matrix to factor (must be MATSEQAIJCUSPARSE)
.   info - factorization options (unused here; pattern is fixed as IC(0))
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's device values are current */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* csric02 errors out on empty (m=0) matrices */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* cusparseXcsric02_zeroPivot() is blocking: it synchronizes the device before reporting */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analyze the forward solve L y = b */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* Factor lives only on the GPU; MatSeqAIJCUSPARSECopyFromGPU() can bring it back if needed */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* A ~ L L^T is symmetric, so solve == solvetranspose */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed at symbolic time */
  PetscFunctionReturn(0);
}
1835 
/*
  Symbolic IC(0) factorization on the GPU.

  Since IC(0) introduces no fill, the factor reuses A's sparsity pattern: A's row
  pointers and column indices are copied to the factor, descriptors and work
  buffers for csric02 and the two SpSV solves (L and L^T) are created and sized,
  and the csric02 structural analysis is performed. The numeric phase is then
  delegated to MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0().

  Input Parameters:
.   fact - the factor matrix to set up
.   A    - the matrix to factor (must be square MATSEQAIJCUSPARSE with a full diagonal)
.   perm - row/column permutation (must be identity for this fast path; checked by the caller)
.   info - factorization options (only info->fill is recorded)
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) keeps A's pattern exactly */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* Legacy descriptor for the csric02 factorization routines */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* unlike ILU0, IC(0)'s L carries a real (non-unit) diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); /* csric02 errors out on empty (m=0) matrices */

  /* Work vectors backing the dense-vector descriptors used in SpSV analysis/solve */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* csric02 errors out on empty (m=0) matrices */
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization, logged later by the numeric phase */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* Eliminating each nonzero left of the diagonal updates the entries from it to the
          end of the row, costing one multiplication and one addition per updated entry.
          Roughly half of the off-diagonal entries of row i are assumed to lie left of the
          diagonal, hence nzLeft = (nzRow - 1) / 2.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
1977 #endif
1978 
/*
  Symbolic ILU factorization dispatcher for MATSEQAIJCUSPARSE.

  If cuSPARSE is new enough, factorization is requested on the device, the
  orderings are identities, and no levels of fill are requested (ILU(0)), use
  the fast all-GPU ILU0 path; otherwise fall back to the host SeqAIJ symbolic
  factorization and do only the numeric phase and solves on the GPU.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    /* ILU(0) with natural ordering: the pure-GPU path (resets stale factors itself) */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Host fallback: drop stale device factors, then run the CPU symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2001 
/*
  Symbolic LU factorization for MATSEQAIJCUSPARSE.

  The symbolic phase always runs on the host through the plain SeqAIJ code;
  only the numeric factorization is redirected to the CUSPARSE version.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Discard factors left over from any previous factorization of B */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2012 
/*
  Symbolic ICC factorization dispatcher for MATSEQAIJCUSPARSE.

  Mirrors MatILUFactorSymbolic_SeqAIJCUSPARSE(): use the all-GPU IC(0) path when
  cuSPARSE is new enough, device factorization is requested, the permutation is
  the identity, and no fill levels are requested; otherwise fall back to the
  host SeqAIJ symbolic phase with GPU numeric factorization.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    /* IC(0) with natural ordering: the pure-GPU path (resets stale factors itself) */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* Host fallback: drop stale device factors, then run the CPU symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2032 
/*
  Symbolic Cholesky factorization for MATSEQAIJCUSPARSE.

  The symbolic phase is delegated to the host SeqAIJ implementation; the
  numeric phase is redirected to the CUSPARSE version.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Discard factors left over from any previous factorization of B */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2043 
/* Report the solver package name ("cusparse") for factors produced by this solver type */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2050 
2051 /*MC
2052   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2053   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2055   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2056   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2057   algorithms are not recommended. This class does NOT support direct solver operations.
2058 
2059   Level: beginner
2060 
2061 .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2062 M*/
2063 
/*
  Create a factor matrix of type MATSEQAIJCUSPARSE for A.

  Processes the -mat_factor_bind_factorization option ("host" or "device",
  default "device") to decide where the factorization is computed, then installs
  the symbolic-factorization function pointers appropriate for the requested
  factor type (LU/ILU/ILUDT or Cholesky/ICC) and whether A is bound to the CPU.

  Input Parameters:
.   A     - the matrix to be factored (sequential AIJ-compatible)
.   ftype - the factorization type requested

  Output Parameter:
.   B     - the new factor matrix (not yet factored)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Let the user choose where the factorization runs; fall back to A's prefix if B has none */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A lives on the CPU: use the plain SeqAIJ symbolic factorizations */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A lives on the CPU: use the plain SeqAIJ symbolic factorizations */
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* The factor's storage is allocated during factorization, not here */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2116 
/*
  Copy the matrix values from the GPU back to the host when the host copy is stale.

  For an unfactored matrix the values come from the CSR matrix in the
  Mat_SeqAIJCUSPARSE struct; for a matrix factored on the device (ILU0/ICC0
  paths), they come from the factor's csrVal array. After the copy the matrix is
  marked as valid on both host and device.

  Note: the version guard below must match the CUSPARSE_VERSION >= 11500 guard
  on the ILU0/ICC0 code, which is what allocates fs->csrVal. It previously read
  13500, which made the factored-matrix branch unreachable on cuSPARSE versions
  that do produce device factors, so those matrices erroneously hit the
  "No support" error.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 11500 /* same guard as the ILU0/ICC0 factorization code that sets fs->csrVal */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 11500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2146 
/* Hand out a read/write pointer to the host value array, refreshing it from the GPU first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Make sure the host copy of the values is current before exposing it */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2154 
/* Companion to MatSeqAIJGetArray_SeqAIJCUSPARSE(): the caller may have written through the
   pointer, so mark the device copy stale and clear the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host now holds the authoritative values */
  PetscFunctionReturn(0);
}
2162 
/* Hand out a read-only view of the host value array, refreshing it from the GPU first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Sync device -> host so the returned values are up to date */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2170 
/* Companion to MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(): read-only access cannot have
   modified the values, so the offload mask is left untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
2177 
/* Hand out the host value array for overwriting; since the caller promises to overwrite
   everything, no device-to-host copy is needed first */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}
2184 
/* Companion to MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(): the host values were rewritten,
   so invalidate the device copy and clear the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host now holds the authoritative values */
  PetscFunctionReturn(0);
}
2192 
/*
  Return device pointers to the CSR representation of A and the memory type.

  Ensures the device copy is current, then exposes the raw row-offset, column-index,
  and value arrays of the CSR matrix held by the Mat_SeqAIJCUSPARSE struct. Only
  valid for unfactored matrices; any of i, j, a, mtype may be NULL if not wanted.
  Because cuSPARSE stores 32-bit indices, the i/j pointers cannot be returned as
  PetscInt* in 64-bit-index builds, hence the errors below.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* returned pointers must reflect current values */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA; /* all returned pointers are device memory */
  PetscFunctionReturn(0);
}
2223 
/*
  Copy a SeqAIJ matrix from the host to the GPU, (re)building the cuSPARSE structures.

  Two paths:
    - If the nonzero pattern is unchanged (nonzerostate matches) and the storage
      format is CSR, only the numerical values are uploaded into the existing
      device arrays.
    - Otherwise the device mult structure is destroyed and rebuilt from the host
      CSR (or compressed-row) data, in the format selected in cusparsestruct->format
      (ELL/HYB only before CUDA 11).

  On success A->offloadmask becomes PETSC_OFFLOAD_BOTH, unless the host values
  array a->a is absent (structure-only upload), in which case the mask is unchanged.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* FALSE when only the structure (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { /* device copy missing or stale */
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz); /* upload values into the existing device CSR */
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* transpose values are now stale as well */
    } else {
      /* nonzero pattern changed (or non-CSR format): rebuild the device structures from scratch */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) { /* compressed row layout: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          nnz  = ii[m]; /* no host values: take nnz from the row offsets; skip PETSC_OFFLOAD_BOTH below */
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (one, zero, one) used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* temporary device CSR used only as the source for csr2hyb; freed below */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2373 
/* Thrust functor: accumulate the first tuple element into the second,
   i.e. get<1>(t) becomes get<0>(t) + get<1>(t). */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
2381 
/* Thrust functor: copy the first tuple element onto the second (get<1>(t) = get<0>(t)). */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2389 
/* Thrust functor: copy the second tuple element onto the first (get<0>(t) = get<1>(t)),
   the mirror image of VecCUDAEquals. */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2397 
/* Scratch data attached to Mat_Product for AIJCUSPARSE sparse*(dense|sparse) products */
struct MatMatCusparse {
  PetscBool      cisdense; /* user's C was MATSEQDENSE (CPU): convert the result back after the GPU product */
  PetscScalar   *Bt;       /* device buffer for B^T (pre CUDA-11 only, where csrmm cannot transpose B) */
  Mat            X;        /* intermediate dense matrix used by PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count logged by the numeric phase */
  CsrMatrix     *Bcsr;     /* uncompressed copy of B's CSR when B is in compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* descriptor for Bcsr (used instead of Bmat->matDescr when set) */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* SpGEMMreuse work buffers (CUDA >= 11.4) */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* SpMM / SpGEMM compute buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2422 
/* Destructor for the MatMatCusparse product data: releases device buffers, the
   cuSPARSE descriptors, the intermediate dense matrix X, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt)); /* cudaFree(NULL) is a no-op */
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2446 
2447 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2448 
/*
  Numeric phase for products of a SeqAIJCUSPARSE matrix A with a dense matrix B,
  computed on the GPU via cusparseSpMM (CUDA >= 11) or cusparse_csr_spmm (older CUDA).
  For PtAP/RARt, the sparse*dense product is first written into the intermediate
  dense matrix mmdata->X and then combined with B by a dense-dense multiply.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select op(A) and the dimensions m x n of the sparse*dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose instead of a transposed operation */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write the sparse*dense product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated buffer mmdata->Bt */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* undo the temporary conversions done at the top of this routine */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2606 
/* Symbolic phase for products of a SeqAIJCUSPARSE matrix with a dense matrix:
   sets the sizes and (CUDA dense) type of C and allocates the product scratch
   data, including the intermediate matrix needed by PtAP/RARt. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nr, nc;
  PetscBool           cisdense, match;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* the dimensions of C depend on the product type */
  if (product->type == MATPRODUCT_AB) {
    nr = A->rmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    nr = A->cmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    nr = A->rmap->n;
    nc = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    nr = B->cmap->n;
    nc = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    nr = B->rmap->n;
    nc = B->rmap->n;
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, nr, nc, nr, nc));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mm->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mm->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2677 
/*
  Numeric phase of sparse*sparse products C = op(A) op(B) (AB, and AtB/ABt realized
  through explicit transposes, since cuSPARSE SpGEMM does not support transposed
  operands) for SeqAIJCUSPARSE matrices. The CSR structure of C was built by the
  symbolic phase; this routine (re)computes its values on the GPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* nothing to compute for an empty C */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* when an operand is symmetric, symbolic may have treated AtB/ABt as plain AB */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the stored matrices (explicit transposes stand in for op = transpose) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: structure was precomputed with SpGEMMreuse; only recompute values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2798 
/* MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of C = op(A)*op(B)
   for two MATSEQAIJCUSPARSE matrices.

   Computes the nonzero pattern of C on the GPU with cuSPARSE spgemm, allocates the
   device CSR storage for C, copies the pattern back to the host, and stashes the
   cuSPARSE descriptors and work buffers needed by the numeric phase in the
   MatMatCusparse product data attached to C->product.

   Supported product types: AB, AtB (via an explicit transpose of A) and ABt (via an
   explicit transpose of B). Logs an error for any other type. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* AtB with symmetric A (or ABt with symmetric B) reduces to a plain AB product */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* select the mult structs (possibly the explicit transposes) and the result sizes m x n (with inner dimension k) */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used as alpha/beta by the numeric phase (pointer mode DEVICE) */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full-length row offsets; column indices and
       values are shared with B's compressed storage, only the row offsets are new */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

  /* run the GPU spgemm: SpGEMMreuse (CUDA >= 11.4), generic SpGEMM (CUDA >= 11),
     or the legacy csrgemm interface (CUDA < 11) */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
  #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* copy the CSR pattern of C back to the host and do the bookkeeping of a shorter MatAssemblyEnd_SeqAIJ */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row counts derived from the row offsets just copied back */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt) !!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3172 
3173 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3174 
/* handles sparse or dense B */
/* MatProductSetFromOptions_SeqAIJCUSPARSE - choose the symbolic routine for a product
   whose A is MATSEQAIJCUSPARSE.

   Dispatch:
   - dense B                     -> MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA
                                    (or the CPU SeqAIJ*SeqDense path when A is bound to the CPU);
   - cusparse B (and C for ABC)  -> MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE for AB/AtB/ABt,
                                    MatProductSymbolic_ABC_Basic for PtAP/RARt/ABC;
   - otherwise                   -> fall back to the SeqAIJ (CPU) dispatch.
   When all operands are cusparse, command-line options (e.g. -matmatmult_backend_cpu or
   -mat_product_algorithm_backend_cpu) can force the CPU backend. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* an operand bound to the CPU disqualifies the GPU backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the user called MatMatMult() etc. directly
       (api_user) or went through the MatProduct interface */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
3292 
/* MatMult_SeqAIJCUSPARSE - yy = A*xx; thin wrapper over the shared mult-add kernel
   (no add vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3299 
/* MatMultAdd_SeqAIJCUSPARSE - zz = A*xx + yy; thin wrapper over the shared mult-add kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3306 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - yy = A^H * xx (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3313 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - zz = A^H * xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3320 
/* MatMultTranspose_SeqAIJCUSPARSE - yy = A^T * xx (trans = PETSC_TRUE, herm = PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3327 
/* ScatterAdd - y[idx[i]] += x[i] for i in [0, n).
   Parameters:
     n   - number of entries to scatter
     idx - destination indices into y (read-only)
     x   - source values (read-only)
     y   - accumulated output
   NOTE(review): no atomics are used, so the entries of idx are assumed to be
   distinct (as for compressed-row index lists) -- confirm at call sites.
   A grid-stride loop makes the kernel correct for any launch configuration, and
   PetscInt indexing avoids 32-bit overflow of blockIdx.x * blockDim.x when n is
   large and PetscInt is 64-bit. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x * gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3333 
3334 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3335 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3336 {
3337   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3338   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3339   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3340   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3341   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3342   PetscBool                     compressed;
3343 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3344   PetscInt nx, ny;
3345 #endif
3346 
3347   PetscFunctionBegin;
3348   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3349   if (!a->nz) {
3350     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
3351     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3352     PetscFunctionReturn(0);
3353   }
3354   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3355   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3356   if (!trans) {
3357     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3358     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3359   } else {
3360     if (herm || !A->form_explicit_transpose) {
3361       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3362       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3363     } else {
3364       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3365       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3366     }
3367   }
3368   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3369   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3370 
3371   try {
3372     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3373     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3374     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3375 
3376     PetscCall(PetscLogGpuTimeBegin());
3377     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3378       /* z = A x + beta y.
3379          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3380          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3381       */
3382       xptr = xarray;
3383       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3384       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3385 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3386       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3387           allocated to accommodate different uses. So we get the length info directly from mat.
3388        */
3389       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3390         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3391         nx             = mat->num_cols;
3392         ny             = mat->num_rows;
3393       }
3394 #endif
3395     } else {
3396       /* z = A^T x + beta y
3397          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3398          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3399        */
3400       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3401       dptr = zarray;
3402       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3403       if (compressed) { /* Scatter x to work vector */
3404         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3405 
3406         thrust::for_each(
3407 #if PetscDefined(HAVE_THRUST_ASYNC)
3408           thrust::cuda::par.on(PetscDefaultCudaStream),
3409 #endif
3410           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3411           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3412       }
3413 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3414       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3415         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3416         nx             = mat->num_rows;
3417         ny             = mat->num_cols;
3418       }
3419 #endif
3420     }
3421 
3422     /* csr_spmv does y = alpha op(A) x + beta y */
3423     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3424 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3425       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3426       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3427         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3428         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3429         PetscCallCUSPARSE(
3430           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3431         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3432 
3433         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3434       } else {
3435         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3436         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3437         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3438       }
3439 
3440       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3441                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3442 #else
3443       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3444       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3445 #endif
3446     } else {
3447       if (cusparsestruct->nrows) {
3448 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3449         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3450 #else
3451         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3452         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3453 #endif
3454       }
3455     }
3456     PetscCall(PetscLogGpuTimeEnd());
3457 
3458     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3459       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3460         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3461           PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
3462         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3463           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
3464         }
3465       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3466         PetscCall(VecSet_SeqCUDA(zz, 0));
3467       }
3468 
3469       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3470       if (compressed) {
3471         PetscCall(PetscLogGpuTimeBegin());
3472         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3473            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3474            prevent that. So I just add a ScatterAdd kernel.
3475          */
3476 #if 0
3477         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3478         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3479                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3480                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3481                          VecCUDAPlusEquals());
3482 #else
3483         PetscInt n = matstruct->cprowIndices->size();
3484         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3485 #endif
3486         PetscCall(PetscLogGpuTimeEnd());
3487       }
3488     } else {
3489       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3490     }
3491     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3492     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3493     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3494   } catch (char *ex) {
3495     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3496   }
3497   if (yy) {
3498     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3499   } else {
3500     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3501   }
3502   PetscFunctionReturn(0);
3503 }
3504 
/* MatMultTransposeAdd_SeqAIJCUSPARSE - computes zz = A^T*xx + yy on the GPU.

   Thin dispatcher into the shared mult-add kernel; the two trailing flags
   select the plain-transpose path (PETSC_TRUE) rather than the Hermitian
   one (PETSC_FALSE). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3511 
/* MatAssemblyEnd_SeqAIJCUSPARSE - finishes assembly via the host SeqAIJ path and
   discards the cached device matrix if the nonzero structure changed during assembly. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscObjectState    stateBefore = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* a new nonzero state invalidates the flattened device-side copy */
  if (cusparse->deviceMat && stateBefore != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparse->deviceMat));
    cusparse->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3526 
3527 /* --------------------------------------------------------------------------------*/
3528 /*@
3529    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3531    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3532    assembly performance the user should preallocate the matrix storage by setting
3533    the parameter nz (or the array nnz).  By setting these parameters accurately,
3534    performance during matrix assembly can be increased by more than a factor of 50.
3535 
3536    Collective
3537 
3538    Input Parameters:
3539 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3540 .  m - number of rows
3541 .  n - number of columns
3542 .  nz - number of nonzeros per row (same for all rows)
3543 -  nnz - array containing the number of nonzeros in the various rows
3544          (possibly different for each row) or NULL
3545 
3546    Output Parameter:
3547 .  A - the matrix
3548 
3549    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3551    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3552 
3553    Notes:
3554    If nnz is given then nz is ignored
3555 
3556    The AIJ format, also called
3557    compressed row storage, is fully compatible with standard Fortran 77
3558    storage.  That is, the stored row and column indices can begin at
3559    either one (as in Fortran) or zero.  See the users' manual for details.
3560 
3561    Specify the preallocated storage with either nz or nnz (not both).
3562    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3563    allocation.  For large problems you MUST preallocate memory or you
3564    will get TERRIBLE performance, see the users' manual chapter on matrices.
3565 
3566    By default, this format uses inodes (identical nodes) when possible, to
3567    improve numerical efficiency of matrix-vector products and solves. We
3568    search for consecutive rows with the same nonzero structure, thereby
3569    reusing matrix information to achieve increased efficiency.
3570 
3571    Level: intermediate
3572 
3573 .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3574 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* cast drops const; the preallocation routine is presumed to only read nnz -- TODO confirm */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}
3584 
/* MatDestroy_SeqAIJCUSPARSE - releases the GPU-side data held in A->spptr,
   detaches every composed method, then falls through to the host SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* every function composed on this object by MatConvert/MatBindToCPU; all reset below */
  const char *const composedNames[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                       "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* unfactored matrices keep a Mat_SeqAIJCUSPARSE in spptr, factored ones keep triangular-factor data */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (int i = 0; i < (int)(sizeof(composedNames) / sizeof(composedNames[0])); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composedNames[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3606 
3607 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3608 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* MatDuplicate_SeqAIJCUSPARSE - duplicates via the host SeqAIJ path, then converts
   the copy in place back to SEQAIJCUSPARSE so it regains the GPU function table. */
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
3616 
/* MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y on the GPU.

   Three strategies depending on the (possibly upgraded) MatStructure:
   - SAME_NONZERO_PATTERN:   a single cublasXaxpy on the CSR value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam writing into Y's existing pattern;
   - otherwise:              fall back to the host MatAXPY_SeqAIJ. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  /* mixed bindings (one matrix bound to CPU): do the work on the host instead */
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: compare row offsets and column
     indices on the device; identical patterns allow the cheap value-array axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via csrgeam, with the result written back into Y's pattern */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* scalars a and b live on the host here, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit workspace buffer, queried first */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: a dense axpy over the nz-long value arrays suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: host fallback, which may change Y's structure */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
3702 
/* MatScale_SeqAIJCUSPARSE - computes Y = a*Y by scaling the CSR value array on the GPU with cuBLAS */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t handle;
  PetscScalar   *vals;
  PetscBLASInt   n = 1, inc = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* guard against PetscInt -> BLAS int overflow */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3722 
/* MatZeroEntries_SeqAIJCUSPARSE - zeroes all stored entries on the host and, when
   device CSR copies exist, on the GPU as well (avoiding a later host->device copy). */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
  PetscBool   ongpu = PETSC_FALSE; /* raised iff the device values were zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    /* keep any cached explicit transpose consistent with the zeroed matrix */
    if (spptr->matTranspose) {
      CsrMatrix *csr = (CsrMatrix *)spptr->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3749 
/* MatBindToCPU_SeqAIJCUSPARSE - switches the matrix's operation tables between the
   host (SeqAIJ) and device (SeqAIJCUSPARSE) implementations.

   flg = PETSC_TRUE binds to the CPU: data is synced back from the GPU, the host
   function pointers are installed, and the GPU-specific composed methods are
   removed. flg = PETSC_FALSE installs the GPU implementations. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* factored matrices have their own solve path; only record the binding */
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make the host copy current before handing control to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* reset the SeqAIJ-internal (array access) function table to its defaults */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* route raw value-array access through the GPU-aware getters */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the host kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3813 
/* MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a host SeqAIJ matrix to SEQAIJCUSPARSE.

   Handles all three MatReuse modes (for MAT_INPLACE_MATRIX neither copy branch runs
   and A itself is retargeted). Allocates the GPU-side context in B->spptr on first
   conversion, installs the type's method tables, and composes the type-specific
   methods on the object. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      /* keep all cuSPARSE work on PETSc's stream so it orders with other GPU ops */
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: spptr holds the triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining GPU function pointers and composed methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3873 
/* MatCreate_SeqAIJCUSPARSE - type constructor registered for MATSEQAIJCUSPARSE;
   builds a host SeqAIJ matrix and converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
3881 
3882 /*MC
3883    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3884 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3886    CSR, ELL, or Hybrid format.
3887    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3888 
3889    Options Database Keys:
3890 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3891 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3894 
3895   Level: beginner
3896 
3897 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3898 M*/
3899 
3900 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3901 
/* MatSolverTypeRegister_CUSPARSE - registers the cuSPARSE-based factorizations
   with PETSc's solver-type table. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* the banded LU solver takes plain host SeqAIJ input */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  /* all remaining factorizations share one factory routine on SEQAIJCUSPARSE input */
  for (int i = 0; i < (int)(sizeof(factors) / sizeof(factors[0])); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(0);
}
3913 
/* MatResetPreallocationCOO_SeqAIJCUSPARSE - releases all COO-assembly state held
   in the matrix's GPU context (permutation vectors and, for the extended COO
   path, the device jmap/perm arrays). A no-op when no GPU context exists. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d
       when non-NULL, so leaving them dangling would cause a double free */
    cusp->jmap_d = nullptr;
    cusp->perm_d = nullptr;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3931 
/* MatSeqAIJCUSPARSE_Destroy - frees the entire GPU context of an unfactored
   SEQAIJCUSPARSE matrix: the mult structs for A and A^T, thrust work vectors,
   COO state, the cuSPARSE handle, and finally the context struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    /* thrust vectors: deleting a NULL pointer is a safe no-op */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device arrays, allocated with cudaMalloc */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct)); /* also zeroes the caller's pointer */
  }
  PetscFunctionReturn(0);
}
3950 
/* CsrMatrix_Destroy - frees a CsrMatrix (its three thrust device vectors plus the
   struct itself) and NULLs the caller's pointer; safe to call on a NULL entry. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = nullptr;
  }
  PetscFunctionReturn(0);
}
3963 
/* Destroy one triangular-factor structure: its cuSPARSE descriptor and solve
   info, its CSR storage, and the associated device/host work buffers.
   Frees and NULLs *trifactor; no-op on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
    if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
    PetscCall(CsrMatrix_Destroy(&tf->csrMat));
    if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
    if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor)); /* also sets *trifactor to NULL */
  }
  PetscFunctionReturn(0);
}
3980 
/* Destroy a multiply structure in the given storage format: the stored matrix
   (CSR, or HYB on CUDA < 11), its descriptor, the compressed-row index array,
   the device-resident scalar constants, and (CUDA >= 11) the SpMV descriptors
   and buffers. Deletes and NULLs *matstruct; no-op on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was called without PetscCall(), silently dropping the error code */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV configuration per supported cusparseOperation_t */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
4022 
/* Release everything held by a Mat_SeqAIJCUSPARSETriFactors without freeing
   the structure itself (the cuSPARSE handle is kept; MatSeqAIJCUSPARSETriFactors_Destroy()
   releases it). All freed pointers are NULLed so the reset is idempotent:
   previously a second call (or a later destroy of already-freed members)
   would double-free device buffers and destroy stale cuSPARSE descriptors. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->a_band_d      = NULL;
    fs->i_band_d      = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* cudaFree(NULL) is a no-op, so these are safe after NULLing below */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->csrRowPtr     = NULL;
    fs->csrColIdx     = NULL;
    fs->csrVal        = NULL;
    fs->X             = NULL;
    fs->Y             = NULL;
    fs->spsvBuffer_L  = NULL;
    fs->spsvBuffer_U  = NULL;
    fs->spsvBuffer_Lt = NULL;
    fs->spsvBuffer_Ut = NULL;
    /* descriptor destroys are guarded: not every factorization path creates all of them */
    if (fs->matDescr_M) PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    if (fs->spsvDescr_L) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    if (fs->spsvDescr_Lt) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    if (fs->spsvDescr_U) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    if (fs->spsvDescr_Ut) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    if (fs->ilu0Info_M) PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    if (fs->ic0Info_M) PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->matDescr_M   = NULL;
    fs->spMatDescr_L = NULL;
    fs->spMatDescr_U = NULL;
    fs->spsvDescr_L  = NULL;
    fs->spsvDescr_Lt = NULL;
    fs->spsvDescr_U  = NULL;
    fs->spsvDescr_Ut = NULL;
    fs->dnVecDescr_X = NULL;
    fs->dnVecDescr_Y = NULL;
    fs->ilu0Info_M   = NULL;
    fs->ic0Info_M    = NULL;

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
4071 
/* Fully destroy a trifactors structure: reset its contents, then release the
   cuSPARSE handle and free the structure itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle = (*trifactors)->handle; /* _Reset() leaves the handle untouched */

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
4084 
/* Strict-weak lexicographic ordering on (i,j) tuples: by row first, then column */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
4093 
/* Equality predicate on (i,j) tuples: true iff both row and column match */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
4101 
/* Binary op for adjacent_difference: 1 where consecutive values differ, else 0 */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
4105 
/* Logical-or combiner used to merge the row-change and column-change masks */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
4109 
4110 #include <thrust/iterator/discard_iterator.h>
4111 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Scatter/reduce the COO values v[] into the device CSR values array, using the
   permutation cusp->cooPerm (and, when duplicates were present at preallocation,
   the segment map cusp->cooPerm_a) built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   v may live on host (copied to device here) or already on device; v == NULL
   with INSERT_VALUES zeroes the matrix. imode selects add-to vs overwrite. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device copy of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation state: fall back to a plain assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v[] is on host: stage it into a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
4188 
/* Mark the cached transpose as stale; when destroy is true also free the
   transpose multiply structure and the cached csr2csc index map. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4204 
4205 #include <thrust/binary_search.h>
4206 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A from n COO (coo_i[], coo_j[]) pairs entirely on
   the device: sort the pairs lexicographically recording the permutation
   (cooPerm), collapse duplicates (recording the segment map cooPerm_a when any
   exist), and derive the row-offset array by binary search. The host-side
   Mat_SeqAIJ arrays (a->i, a->j, a->a) are then filled from the device result.
   coo_i[]/coo_j[] may each independently live on host or device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* previous COO state has a different size: discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      /* host indices: stage them into temporary device memory */
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of A with ones built from the device result */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4341 
/* COO preallocation entry point. Uses the fast "basic" device path when the
   index arrays are device-resident or contain no negative (ignored) entries;
   otherwise falls back to the host SeqAIJ COO code and mirrors its jmap/perm
   arrays onto the device for MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool    use_basic = PETSC_TRUE;
  PetscMemType mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* the basic path cannot handle negative indices; we can only scan for them when
     the arrays are on the host */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      PetscCount k = 0;
      while (k < coo_n && coo_i[k] >= 0 && coo_j[k] >= 0) k++;
      if (k < coo_n) use_basic = PETSC_FALSE;
    }
  }

  if (use_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* copy the host-built jmap/perm arrays to the device for the extended-COO kernel */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4380 
/* Device kernel: for each nonzero i, sum the COO values kv[perm[k]] for
   k in [jmap[i], jmap[i+1]) and either overwrite (INSERT_VALUES) or accumulate
   into a[i]. 1D launch; the grid-stride loop covers all nnz entries for any
   grid size. kv is read-only: __restrict__ lets the compiler use the read-only
   data cache and removes the aliasing assumption against a[]. */
__global__ static void MatAddCOOValues(const PetscScalar *__restrict__ kv, PetscCount nnz, const PetscCount *__restrict__ jmap, const PetscCount *__restrict__ perm, InsertMode imode, PetscScalar *a)
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4391 
/* Set/accumulate COO values on the device. On the extended-COO path the kernel
   MatAddCOOValues reduces duplicates via the jmap/perm arrays staged at
   preallocation time; otherwise defers to the thrust-based basic path.
   NOTE(review): unlike the basic path, this branch does not handle v == NULL
   (PetscGetMemType/cudaMemcpy would be fed a NULL pointer) — confirm callers
   never pass NULL with use_extended_coo set. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v; /* device-resident view of v[] */
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* write access suffices for INSERT_VALUES (old values are discarded) */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      /* one thread per nonzero, 256 threads per block; grid-stride loop in the kernel */
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}
4426 
4427 /*@C
4428     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
4429 
4430    Not collective
4431 
4432     Input Parameters:
4433 +   A - the matrix
4434 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4435 
4436     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4439 
4440     Level: developer
4441 
4442     Note:
4443       When compressed is true, the CSR structure does not contain empty rows
4444 
4445 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4446 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* both outputs must be requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (!compressed && a->compressedrow.use) {
    /* the device CSR is row-compressed but the caller wants full row offsets:
       build (once) and cache a device copy of the host a->i array */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else {
    *i = csr->row_offsets->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4473 }
4474 
4475 /*@C
4476     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4477 
4478    Not collective
4479 
4480     Input Parameters:
4481 +   A - the matrix
4482 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4483 
4484     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4487 
4488     Level: developer
4489 
4490 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4491 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* simply invalidate the caller's pointers; no device state to release */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4500 }
4501 
4502 /*@C
4503    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4504 
4505    Not Collective
4506 
4507    Input Parameter:
4508 .   A - a `MATSEQAIJCUSPARSE` matrix
4509 
4510    Output Parameter:
4511 .   a - pointer to the device data
4512 
4513    Level: developer
4514 
4515    Note:
4516    May trigger host-device copies if up-to-date matrix data is on host
4517 
4518 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4519 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* make sure the device values are current before handing them out */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offload mask and transpose cache are left untouched */
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4536 }
4537 
4538 /*@C
4539    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4540 
4541    Not Collective
4542 
4543    Input Parameter:
4544 .   A - a `MATSEQAIJCUSPARSE` matrix
4545 
4546    Output Parameter:
4547 .   a - pointer to the device data
4548 
4549    Level: developer
4550 
4551 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4552 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state change or object-state bump needed, just
     invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4561 }
4562 
4563 /*@C
4564    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4565 
4566    Not Collective
4567 
4568    Input Parameter:
4569 .   A - a `MATSEQAIJCUSPARSE` matrix
4570 
4571    Output Parameter:
4572 .   a - pointer to the device data
4573 
4574    Level: developer
4575 
4576    Note:
4577    May trigger host-device copies if up-to-date matrix data is on host
4578 
4579 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4580 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write: bring the current values to the device first */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* caller may modify the values: device copy becomes authoritative and the
     cached transpose values become stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4599 }
4600 /*@C
4601    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4602 
4603    Not Collective
4604 
4605    Input Parameter:
4606 .   A - a `MATSEQAIJCUSPARSE` matrix
4607 
4608    Output Parameter:
4609 .   a - pointer to the device data
4610 
4611    Level: developer
4612 
4613 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4614 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: drop cached diagonal info and bump the object
     state so dependent objects notice */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4625 }
4626 
4627 /*@C
4628    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4629 
4630    Not Collective
4631 
4632    Input Parameter:
4633 .   A - a `MATSEQAIJCUSPARSE` matrix
4634 
4635    Output Parameter:
4636 .   a - pointer to the device data
4637 
4638    Level: developer
4639 
4640    Note:
4641    Does not trigger host-device copies and flags data validity on the GPU
4642 
4643 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4644 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU(), existing values are discarded */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* device copy becomes authoritative; cached transpose values become stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4662 }
4663 
4664 /*@C
4665    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4666 
4667    Not Collective
4668 
   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data obtained with `MatSeqAIJCUSPARSEGetArrayWrite()`; set to `NULL` on return
4674 
4675    Level: developer
4676 
4677 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4678 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the caller had write access, so any cached (inverse) diagonal may now be stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so downstream users notice the values may have changed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
4690 
/* Strict weak ordering on (row, col, value, perm) tuples: order by row, break ties by column.
   Used to merge the COO representations of two matrices into row-major order. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
4699 
/* Unary functor adding a fixed offset to an integer; used below to shift column/row-offset indices. */
struct Shift {
  int delta_; /* amount added to every input */

  Shift(int d) : delta_(d) { }
  __host__ __device__ inline int operator()(const int &i) { return i + delta_; }
};
4706 
/* Merges two SeqAIJCUSPARSE matrices A, B side-by-side into C = [A, B] (each row of C is the
   corresponding row of A followed by that of B); [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* A and B must have the same number of rows to be concatenated side-by-side */
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Build C = [A, B] from scratch: allocate the CUSPARSE structures for C by hand
       (instead of going through MatSetUp) so the merge can be done entirely on the GPU */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV routines */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    /* make sure the CSR data of A and B is up to date on the device */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records where each entry of A (first Annz slots) and of B (last Bnnz slots)
       lands inside C's value array, so MAT_REUSE_MATRIX can scatter new values cheaply */
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets of A and B to per-entry COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag every A entry with 1 and every B entry with 0; the tag survives the merge
         and later tells us which merged slot came from which input matrix */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* shift B's column indices by A's column count so they address the right half of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      /* merge the two row-sorted COO streams into C, ordered by (row, col); wPerm
         receives the A/B origin tag of each merged entry */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged positions by origin tag: positions of A's entries go to
         cooPerm[0..Annz), positions of B's entries to cooPerm[Annz..) */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress C's COO row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] is a vertical stacking: its CSR data is A^T's data followed by
           B^T's, with B^T's row offsets shifted by nnz(A) */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one slot: A^T's closing offset coincides with B^T's (shifted) opening offset */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure (i, j) of C to the host so the Mat_SeqAIJ bookkeeping is valid;
       the values themselves stay on the GPU (offloadmask set below) */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* fill the per-row counters used by the SeqAIJ kernels */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure of C is unchanged, only scatter the new values of A and B
       into C's value array using the cooPerm permutation recorded above */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C via cooPerm[0..Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      /* scatter B's values into C via cooPerm[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* C^T values are simply A^T's values followed by B^T's (vertical stacking) */
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* values live on the device only */
  PetscFunctionReturn(0);
}
4988 
/* Copy selected entries (by position idx into the aij value array, or the first n entries when
   idx is NULL) of A's device values into v, which may live in host or device memory. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does v point to device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* gather av[idx[k]] for k = 0..n-1 on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch buffer, copy back below */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved device->host, so log GpuToCpu (was wrongly CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}
5024