xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 750b007cd8d816cecd9de99077bb0a703b4cf61a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #if PETSC_CPP_VERSION >= 14
17 #define PETSC_HAVE_THRUST_ASYNC 1
18 // thrust::for_each(thrust::cuda::par.on()) requires C++14
19 #include <thrust/async/for_each.h>
20 #endif
21 #include <thrust/iterator/constant_iterator.h>
22 #include <thrust/remove.h>
23 #include <thrust/sort.h>
24 #include <thrust/unique.h>
25 
26 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30 
31   typedef enum {
32       CUSPARSE_MV_ALG_DEFAULT = 0,
33       CUSPARSE_COOMV_ALG      = 1,
34       CUSPARSE_CSRMV_ALG1     = 2,
35       CUSPARSE_CSRMV_ALG2     = 3
36   } cusparseSpMVAlg_t;
37 
38   typedef enum {
39       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45       CUSPARSE_SPMM_COO_ALG1    = 1,
46       CUSPARSE_SPMM_COO_ALG2    = 2,
47       CUSPARSE_SPMM_COO_ALG3    = 3,
48       CUSPARSE_SPMM_COO_ALG4    = 5,
49       CUSPARSE_SPMM_CSR_ALG1    = 4,
50       CUSPARSE_SPMM_CSR_ALG2    = 6,
51   } cusparseSpMMAlg_t;
52 
53   typedef enum {
54       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
55       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
56   } cusparseCsr2CscAlg_t;
57   */
58 const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59 const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61 #endif
62 
63 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66 
67 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
68 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
69 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
70 
71 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
72 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
73 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
74 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
75 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
76 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
77 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
78 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
79 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
85 
86 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
88 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
91 
92 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
94 
95 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98 
/*
  Type-specific backend for MatCUSPARSESetFormat(): records the requested cuSPARSE
  storage format in the matrix's Mat_SeqAIJCUSPARSE context.

  For sequential AIJ matrices only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are
  meaningful; both simply set the single stored format. Any other operation is an error.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both ops store into the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
110 
111 /*@
112    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
113    operation. Only the `MatMult()` operation can use different GPU storage formats
114 
115    Not Collective
116 
117    Input Parameters:
118 +  A - Matrix of type `MATSEQAIJCUSPARSE`
119 .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
120         `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
121 -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
122 
   Level: intermediate
126 
127 .seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
128 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the implementation attached via PetscObjectComposeFunction();
     PetscTryMethod() is a no-op for matrix types that do not provide one */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
135 
/*
  Type-specific backend for MatCUSPARSESetUseCPUSolve(): stores the flag that
  selects the built-in CPU MatSolve() path instead of the GPU triangular solve.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
143 
144 /*@
145    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
146 
147    Input Parameters:
148 +  A - Matrix of type `MATSEQAIJCUSPARSE`
149 -  use_cpu - set flag for using the built-in CPU `MatSolve()`
150 
151    Output Parameter:
152 
153    Note:
154    The cuSparse LU solver currently computes the factors with the built-in CPU method
155    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
157 
158    Level: intermediate
159 
160 .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
161 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the implementation attached via PetscObjectComposeFunction();
     PetscTryMethod() is a no-op for matrix types that do not provide one */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
168 
/*
  MatSetOption() implementation for MATSEQAIJCUSPARSE. Handles
  MAT_FORM_EXPLICIT_TRANSPOSE here and forwards every other option to the
  plain SeqAIJ handler.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* drop any cached transpose when turning the option off, so a later
       re-enable cannot pick up stale data */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
181 
182 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
183 
/*
  Numeric LU factorization for MATSEQAIJCUSPARSE. The factorization itself is
  computed on the CPU by MatLUFactorNumeric_SeqAIJ(); this wrapper then picks
  the appropriate GPU (or CPU) solve routines and, unless the CPU solve was
  requested, uploads the triangular factors to the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ         *b  = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cu = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscBool           row_identity, col_identity;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before the CPU factorization */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural ordering (identity row and column permutations) enables the
     specialized solve kernels */
  PetscCall(ISIdentity(b->row, &row_identity));
  PetscCall(ISIdentity(b->col, &col_identity));
  if (!cu->use_cpu_solve) {
    const PetscBool natural = (PetscBool)(row_identity && col_identity);

    B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* push the triangular factors to the GPU and run the solve analysis */
  if (!cu->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
214 
/*
  Processes runtime options for MATSEQAIJCUSPARSE matrices: storage format for
  SpMV/TriSolve, CPU-vs-GPU solve, and (CUDA >= 11) the cuSPARSE algorithm
  choices for SpMV, SpMM and CSR->CSC conversion. Options apply only to
  unfactored matrices (A->factortype == MAT_FACTOR_NONE).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
249 
/*
  Builds (or refreshes) the unit-diagonal lower-triangular factor L of an ILU
  factorization on the GPU as a CSR matrix for cusparse triangular solves.

  The host-side factor stored in Mat_SeqAIJ is repacked row by row into pinned
  host buffers (cudaMallocHost) with an explicit 1.0 appended on each diagonal,
  then uploaded into thrust device arrays. On the first call the cusparse
  matrix descriptor and the csrsv solve-analysis information are created as
  well; on subsequent calls (loTriFactor already present) only the numerical
  values are refilled and re-uploaded, reusing the existing structure.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[1]-ai[0] entries of row 0 belong entirely to U, hence the ai[n]-ai[1] term plus n diagonal ones */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host staging buffers for the values/indices to be uploaded */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0; /* row 0 of L is just the unit diagonal */
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz      = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        /* diagonal is all ones, so the solve may skip it */
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload the CSR arrays to the device via thrust assign() */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the workspace the csrsv analysis/solve needs */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo (as AA_h) for value-only refreshes; the index buffers are no longer needed */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
382 
/*
  Builds (or refreshes) the upper-triangular factor U of an ILU factorization
  on the GPU as a CSR matrix for cusparse triangular solves.

  SeqAIJ stores U rows backwards via a->diag (adiag), so rows are walked from
  n-1 down to 0; each row's diagonal entry is stored as its reciprocal
  (1./v[nz]) because the descriptor uses CUSPARSE_DIAG_TYPE_NON_UNIT. Data is
  staged in pinned host buffers and uploaded into thrust device arrays. On the
  first call the cusparse descriptor and csrsv solve analysis are created; on
  subsequent calls only the values are refilled and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers for the values/indices to be uploaded */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* row i of U is stored at aa[adiag[i+1]+1 .. adiag[i]], diagonal last */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* store reciprocal: descriptor is NON_UNIT diagonal */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* upload the CSR arrays to the device via thrust assign() */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the workspace the csrsv analysis/solve needs */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep AAUp (as AA_h) for value-only refreshes; the index buffers are no longer needed */
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only, structure is unchanged */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
514 
/*
  Uploads both ILU triangular factors of A to the GPU and prepares everything
  MatSolve needs: a device work vector of length n, and device copies of the
  row (a->row) and column (a->icol) permutation indices when those orderings
  are not the identity. Marks the matrix as offloaded on both host and device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* build/refresh the device copies of L and U */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector used between the L and U solves */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  /* upload the row permutation only once and only when it is not the identity */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  /* likewise for the column permutation */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
556 
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - builds (or refreshes) the GPU copies of the
  upper triangular ICC factor U and of U^T (stored as "lower"), used by the cusparse
  triangular solves.

  On the first call (no factor structs yet) both the sparsity structure and the values
  are assembled on the host in pinned memory and uploaded, and the cusparse solve
  analysis is performed for each factor. On subsequent calls only the numerical values
  are recomputed and re-uploaded; the structure and analysis are reused.

  Nothing is done when the matrix data already resides on the GPU
  (offloadmask is neither UNALLOCATED nor CPU), or when the matrix has zero rows.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (above) and Mat_SeqSBAIJ (below);
     the ICC factor presumably stores its i/j/a arrays in the SBAIJ layout with the
     diagonal entry last in each row (see v[nz] below) — confirm against the factorization code */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of U (AAUp) and U^T (AALo) */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* first call: build structure (row offsets, column indices) as well as values */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz]; /* v[nz] is the diagonal entry of row i */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonals: U gets -v, U^T (AALo) gets -v scaled by 1/diag */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation: the "lower" solve reuses the upper structure with a transposed solve */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same structure as the upper factor (AiUp/AjUp), different values (AALo) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* subsequent calls: structure unchanged, recompute and re-upload values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
758 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - pushes the ICC triangular factors to
  the GPU and, when the factorization ordering is not the identity, uploads the row
  and column (inverse) permutation indices needed by the permuted MatSolve path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscInt                      nrows   = A->rmap->n;
  PetscBool                     identity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* nonzeros of the full (symmetric) factor pair: both triangles plus one diagonal */
  factors->nnz   = (aij->nz - nrows) * 2 + nrows;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              inverse;
    const PetscInt *inv_idx, *perm_idx;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &inverse));
    PetscCall(ISGetIndices(inverse, &inv_idx));
    PetscCall(ISGetIndices(rowperm, &perm_idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(perm_idx, perm_idx + nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv_idx, inv_idx + nrows);
    PetscCall(ISRestoreIndices(inverse, &inv_idx));
    PetscCall(ISDestroy(&inverse));
    PetscCall(ISRestoreIndices(rowperm, &perm_idx));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
794 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization done on
  the CPU (via MatCholeskyFactorNumeric_SeqAIJ), followed by selection of the
  ordering-dependent MatSolve implementations and upload of the factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *bf      = (Mat_SeqAIJ *)B->data;
  IS          rowperm = bf->row;
  PetscBool   natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(rowperm, &natural));
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* dense right-hand-side solves are not provided by this implementation */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
822 
/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds explicit transposes (CSC form,
  via csr2csc) of the lower and upper triangular factors and runs the cusparse solve
  analysis on them, so transposed solves can be performed as non-transpose csrsv
  solves on the transposed matrices.

  Each transposed factor copies the descriptor of its source factor, except the fill
  mode, which is flipped (the transpose of an upper factor is lower, and vice versa).

  Fix vs previous revision: the MAT_CUSPARSEGenerateTranspose log event was "closed"
  with a second PetscLogEventBegin instead of PetscLogEventEnd (twice), leaving the
  event permanently unbalanced in the log; both occurrences now call PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; fill mode is flipped */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
#else
                                     loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: unbalanced event */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; fill mode is flipped */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: unbalanced event */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
985 
/* Unary functor that truncates the real part of a PetscScalar to a PetscInt.
   NOTE(review): presumably used with thrust algorithms to recover integer indices
   that were stored in scalar arrays — confirm at the call sites. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};
989 
990 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
991   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
992   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
993   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
994   cusparseStatus_t              stat;
995   cusparseIndexBase_t           indexBase;
996 
997   PetscFunctionBegin;
998   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
999   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1000   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1001   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1002   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1003   if (A->transupdated) PetscFunctionReturn(0);
1004   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1005   PetscCall(PetscLogGpuTimeBegin());
1006   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1007   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1008     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1009     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1010     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1011     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1012     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1013 
1014     /* set alpha and beta */
1015     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1016     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1017     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1018     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1019     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1020     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1021 
1022     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1023       CsrMatrix *matrixT      = new CsrMatrix;
1024       matstructT->mat         = matrixT;
1025       matrixT->num_rows       = A->cmap->n;
1026       matrixT->num_cols       = A->rmap->n;
1027       matrixT->num_entries    = a->nz;
1028       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1029       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1030       matrixT->values         = new THRUSTARRAY(a->nz);
1031 
1032       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1033       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1034 
1035 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1036 #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1037       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1038                                indexBase, cusparse_scalartype);
1039       PetscCallCUSPARSE(stat);
1040 #else
1041       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1042            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1043 
1044            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1045            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1046            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1047         */
1048       if (matrixT->num_entries) {
1049         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1050         PetscCallCUSPARSE(stat);
1051 
1052       } else {
1053         matstructT->matDescr = NULL;
1054         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1055       }
1056 #endif
1057 #endif
1058     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1059 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1060       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1061 #else
1062       CsrMatrix *temp = new CsrMatrix;
1063       CsrMatrix *tempT = new CsrMatrix;
1064       /* First convert HYB to CSR */
1065       temp->num_rows = A->rmap->n;
1066       temp->num_cols = A->cmap->n;
1067       temp->num_entries = a->nz;
1068       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1069       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1070       temp->values = new THRUSTARRAY(a->nz);
1071 
1072       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1073       PetscCallCUSPARSE(stat);
1074 
1075       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1076       tempT->num_rows = A->rmap->n;
1077       tempT->num_cols = A->cmap->n;
1078       tempT->num_entries = a->nz;
1079       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1080       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1081       tempT->values = new THRUSTARRAY(a->nz);
1082 
1083       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1084                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1085       PetscCallCUSPARSE(stat);
1086 
1087       /* Last, convert CSC to HYB */
1088       cusparseHybMat_t hybMat;
1089       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1090       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1091       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1092       PetscCallCUSPARSE(stat);
1093 
1094       /* assign the pointer */
1095       matstructT->mat = hybMat;
1096       A->transupdated = PETSC_TRUE;
1097       /* delete temporaries */
1098       if (tempT) {
1099         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1100         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1101         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1102         delete (CsrMatrix *)tempT;
1103       }
1104       if (temp) {
1105         if (temp->values) delete (THRUSTARRAY *)temp->values;
1106         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1107         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1108         delete (CsrMatrix *)temp;
1109       }
1110 #endif
1111     }
1112   }
1113   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1114     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1115     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1116     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1117     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1118     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1119     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1120     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1121     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1122     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1123     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1124     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1125       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1126       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1127       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1128     }
1129     if (!cusparsestruct->csr2csc_i) {
1130       THRUSTARRAY csr2csc_a(matrix->num_entries);
1131       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1132 
1133       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1134 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1135       void  *csr2cscBuffer;
1136       size_t csr2cscBufferSize;
1137       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1138                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1139       PetscCallCUSPARSE(stat);
1140       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1141 #endif
1142 
1143       if (matrix->num_entries) {
1144         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1145            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1146            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1147 
1148            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1149            should be filled with indexBase. So I just take a shortcut here.
1150         */
1151         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1152 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1153                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1154         PetscCallCUSPARSE(stat);
1155 #else
1156                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1157         PetscCallCUSPARSE(stat);
1158 #endif
1159       } else {
1160         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1161       }
1162 
1163       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1164       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1165 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1166       PetscCallCUDA(cudaFree(csr2cscBuffer));
1167 #endif
1168     }
1169     PetscCallThrust(
1170       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1171   }
1172   PetscCall(PetscLogGpuTimeEnd());
1173   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174   /* the compressed row indices is not used for matTranspose */
1175   matstructT->cprowIndices                       = NULL;
1176   /* assign the pointer */
1177   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1178   A->transupdated                                = PETSC_TRUE;
1179   PetscFunctionReturn(0);
1180 }
1181 
1182 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the triangular factors of A, honoring the row/column
   permutations of the factorization ordering.

   bb - right-hand side vector (read on the GPU)
   xx - solution vector (written on the GPU)

   Because A^T = (LU)^T = U^T L^T (up to the stored permutations), the solve order
   here is: permute b by the row permutation, solve with the transposed upper
   factor, then with the transposed lower factor, and finally apply the column
   permutation. The transposed factor data is built lazily on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation; x serves as scratch for the permuted right-hand side */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the transposed upper factor (U-part comes first for the transposed system) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular solves */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1255 
/* Solve A^T x = b with the triangular factors of A when the factorization used
   the natural ordering: no row/column permutations are needed, so b feeds the
   first (upper-factor) solve directly and x receives the final solve's result.

   bb - right-hand side vector (read on the GPU)
   xx - solution vector (written on the GPU)

   Transposed factor data is built lazily on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor: input b, intermediate result into the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor: input the work vector, result into x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular solves */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1314 
/* Solve A x = b using the triangular factors of A, honoring the row/column
   permutations of the factorization ordering.

   bb - right-hand side vector (read on the GPU)
   xx - solution vector (written on the GPU)

   Pipeline: permute b by the row permutation into the work vector, solve L
   (result in x), solve U (result back in the work vector), then permute the
   work vector by the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector (the index iterator determines the copy length) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: input the work vector, result into x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U: input x, result back into the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular solves */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1376 
/* Solve A x = b with the triangular factors of A when the factorization used
   the natural ordering: no permutations are needed, so b feeds the L-solve
   directly and the U-solve writes straight into x.

   bb - right-hand side vector (read on the GPU)
   xx - solution vector (written on the GPU) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: input b, intermediate result into the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U: input the work vector, result into x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular solves */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1427 
1428 #if CUSPARSE_VERSION >= 11500
1429 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/* MatSolve_SeqAIJCUSPARSE_ILU0 - forward/backward substitution with the ILU(0)
   factors produced by MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0().

   b - right-hand side vector (read on the GPU)
   x - solution vector (written on the GPU)

   Solves L y = b followed by U x = y with cusparseSpSV; the intermediate y
   lives in the factor structure's work array. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xarr;
  const PetscScalar            *barr;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarr));
  PetscCall(VecCUDAGetArrayRead(b, &barr));
  PetscCall(PetscLogGpuTimeBegin());

  /* Lower triangular solve: L y = b, with y held in factors->Y.
     Note cusparseSpSV_solve() implicitly reuses the external buffer that was handed to cusparseSpSV_analysis(). */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)barr));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Upper triangular solve: U x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barr));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarr));

  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular solves */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1460 
/* MatSolveTranspose_SeqAIJCUSPARSE_ILU0 - solve A^T x = b with the ILU(0)
   factors, i.e. U^T y = b followed by L^T x = y, using cusparseSpSV transpose
   solves.

   b - right-hand side vector (read on the GPU)
   x - solution vector (written on the GPU)

   Transpose-solve descriptors and their work buffers are created on the first
   call; the (numeric) SpSV analysis is redone whenever the factor values have
   changed since the last analysis. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xarr;
  const PetscScalar            *barr;

  PetscFunctionBegin;
  /* One-time setup of the transpose-solve descriptors and buffers */
  if (!factors->createdTransposeSpSVDescr) {
    /* The matrices are still L and U; we merely solve with their transposes */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, &factors->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Lt, factors->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, &factors->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Ut, factors->spsvBufferSize_Ut));
    factors->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Redo the numeric analysis if the factor values changed since the last transpose analysis */
  if (!factors->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, factors->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, factors->spsvBuffer_Ut));
    factors->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarr));
  PetscCall(VecCUDAGetArrayRead(b, &barr));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve U^T y = b, with y held in factors->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)barr));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut));

  /* Solve L^T x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barr));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarr));
  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular solves */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1508 
/* MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric ILU(0) factorization of A,
   stored in-place in fact's value array on the GPU.

   fact - the factor matrix (symbolic structure already set up)
   A    - the matrix being factored
   info - factorization options (unused here beyond the symbolic phase)

   Copies A's values into fact, runs cusparseXcsrilu02() in place, then performs
   the SpSV analyses needed for the subsequent triangular solves. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *acusp   = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *csrA;
  PetscInt                      n, nnz;
  PetscBool                     match;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &match));
    PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Refresh fact's values from A with a device-to-device copy */
  n   = fact->rmap->n;
  nnz = a->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  csrA = (CsrMatrix *)acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(factors->csrVal, csrA->values->data().get(), sizeof(PetscScalar) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact in place; cusparseXcsrilu02() errors out on empty matrices (n = 0), so skip it then */
  if (n) PetscCallCUSPARSE(cusparseXcsrilu02(factors->handle, n, nnz, factors->matDescr_M, factors->csrVal, factors->csrRowPtr, factors->csrColIdx, factors->ilu0Info_M, factors->policy_M, factors->factBuffer_M));

  if (PetscDefined(USE_DEBUG)) {
    cusparseStatus_t status;
    int              zeroPivotRow;

    status = cusparseXcsrilu02_zeroPivot(factors->handle, factors->ilu0Info_M, &zeroPivotRow);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", zeroPivotRow, zeroPivotRow);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, so it must run
     after cusparseXcsrilu02(); see https://github.com/NVIDIA/CUDALibrarySamples/issues/78 */
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L, factors->spsvBuffer_L));
  PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U, factors->spsvBuffer_U));

  /* L and U values changed, so any previous transpose-solve analysis is now stale */
  factors->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(factors->numericFactFlops));
  PetscFunctionReturn(0);
}
1559 
1560 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1561   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1562   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1563   PetscInt                      m, nz;
1564 
1565   PetscFunctionBegin;
1566   if (PetscDefined(USE_DEBUG)) {
1567     PetscInt  i;
1568     PetscBool flg, missing;
1569 
1570     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1571     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1572     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1573     PetscCall(MatMissingDiagonal(A, &missing, &i));
1574     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1575   }
1576 
1577   /* Free the old stale stuff */
1578   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1579 
1580   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1581      but they will not be used. Allocate them just for easy debugging.
1582    */
1583   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1584 
1585   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1586   fact->factortype             = MAT_FACTOR_ILU;
1587   fact->info.factor_mallocs    = 0;
1588   fact->info.fill_ratio_given  = info->fill;
1589   fact->info.fill_ratio_needed = 1.0;
1590 
1591   aij->row = NULL;
1592   aij->col = NULL;
1593 
1594   /* ====================================================================== */
1595   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1596   /* We'll do in-place factorization on fact                                */
1597   /* ====================================================================== */
1598   const int *Ai, *Aj;
1599 
1600   m  = fact->rmap->n;
1601   nz = aij->nz;
1602 
1603   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1604   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1605   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1606   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1607   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1608   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1609 
1610   /* ====================================================================== */
1611   /* Create descriptors for M, L, U                                         */
1612   /* ====================================================================== */
1613   cusparseFillMode_t fillMode;
1614   cusparseDiagType_t diagType;
1615 
1616   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1617   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1618   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1619 
1620   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1621     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1622     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1623     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1624     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1625   */
1626   fillMode = CUSPARSE_FILL_MODE_LOWER;
1627   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1628   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1629   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1630   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1631 
1632   fillMode = CUSPARSE_FILL_MODE_UPPER;
1633   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1634   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1635   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1636   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1637 
1638   /* ========================================================================= */
1639   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1640   /* ========================================================================= */
1641   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1642   if (m)
1643     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1644                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1645 
1646   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1647   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1648 
1649   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1650   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1651 
1652   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1653   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1654 
1655   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1656   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1657 
1658   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1659      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1660      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1661      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1662    */
1663   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1664     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1665     fs->spsvBuffer_L = fs->factBuffer_M;
1666     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1667   } else {
1668     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1669     fs->spsvBuffer_U = fs->factBuffer_M;
1670     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1671   }
1672 
1673   /* ========================================================================== */
1674   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1675   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1676   /* ========================================================================== */
1677   int              structural_zero;
1678   cusparseStatus_t status;
1679 
1680   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1681   if (m)
1682     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1683                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1684   if (PetscDefined(USE_DEBUG)) {
1685     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1686     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1687     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1688   }
1689 
1690   /* Estimate FLOPs of the numeric factorization */
1691   {
1692     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1693     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1694     PetscLogDouble flops = 0.0;
1695 
1696     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1697     Ai    = Aseq->i;
1698     Adiag = Aseq->diag;
1699     for (PetscInt i = 0; i < m; i++) {
1700       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1701         nzRow  = Ai[i + 1] - Ai[i];
1702         nzLeft = Adiag[i] - Ai[i];
1703         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1704           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1705         */
1706         nzLeft = (nzRow - 1) / 2;
1707         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1708       }
1709     }
1710     fs->numericFactFlops = flops;
1711   }
1712   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1713   PetscFunctionReturn(0);
1714 }
1715 
/* Solve A x = b with the device ICC(0) factor A = L*L^T held in fs: first solve
   L y = b, then L^T x = y. The intermediate vector y lives in the preallocated
   device buffer fs->Y; b and x are accessed directly as device arrays, so no
   host-device traffic occurs here. Used for both solve and solvetranspose since
   the factorization is symmetric. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  /* Re-point the reusable dense-vector descriptors: X <- b's device array, Y <- scratch */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  /* Reuse descriptor X for the output array; same matrix L, transposed op, separate analysis data in spsvDescr_Lt */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* two triangular solves over nz nonzeros total */
  PetscFunctionReturn(0);
}
1745 
/* Numeric ICC(0) factorization on the device with cusparseXcsric02(), done in place
   on the copy of A's CSR arrays that the symbolic phase stashed in fs. Also runs the
   SpSV analysis for the two triangular solves (L and L^T) so MatSolve can call
   cusparseSpSV_solve directly. Assumes MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0()
   already allocated fs->csrVal/csrRowPtr/csrColIdx and the work buffers. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's device values are current before copying them */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analyze the SpSV for L x = b using the freshly factorized values */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* The factor now lives only on the device; install the ICC0 solve routines */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated in the symbolic phase */
  PetscFunctionReturn(0);
}
1797 
/* Symbolic phase of the device ICC(0) factorization. Since ICC(0) introduces no
   fill, fact reuses A's sparsity pattern: we copy A's CSR row pointers and column
   indices to device arrays owned by fs and will later factorize in place on a copy
   of A's values. This routine also creates all cuSPARSE descriptors, sizes and
   allocates the csric02/SpSV work buffers, runs the structural analysis, and
   estimates the flop count of the numeric phase. Requires a square matrix with a
   full diagonal and the natural (identity) ordering (perm is unused here). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0): no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* Legacy-API descriptor for csric02 (factorization of M) */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* csric02 produces L with a real (non-unit) diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  /* Scratch device vectors reused by every triangular solve */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i]; /* nonzeros in row i (full pattern; estimate only) */
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
1938 #endif
1939 
/* Dispatcher for symbolic ILU: when cuSPARSE is new enough (>= 11.5), the user asked
   for ILU(0) (no fill levels), the row/column orderings are identities, and device
   factorization was requested, take the native cuSPARSE csrilu02 path; otherwise do
   the symbolic phase on the host and wire up the GPU numeric routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool rowIdentity = PETSC_FALSE, colIdentity = PETSC_FALSE;

  if (fs->factorizeOnDevice) { /* only bother checking the orderings when the device path is even possible */
    PetscCall(ISIdentity(isrow, &rowIdentity));
    PetscCall(ISIdentity(iscol, &colIdentity));
  }
  if (!info->levels && rowIdentity && colIdentity) { /* ILU(0) with natural ordering: fully on-device path */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(0);
  }
#endif
  /* Fallback: host symbolic factorization + GPU numeric factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1961 
/* Symbolic LU: delegate the symbolic phase to the SeqAIJ (host) implementation after
   discarding any stale device factors, then install the cuSPARSE numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); /* drop leftovers from any previous factorization */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1971 
/* Dispatcher for symbolic ICC: with cuSPARSE >= 11.5, ICC(0) (no fill levels), an
   identity permutation, and device factorization requested, use the native cuSPARSE
   csric02 path; otherwise fall back to the host symbolic phase with the cuSPARSE
   numeric Cholesky routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool permIdentity = PETSC_FALSE;

  if (fs->factorizeOnDevice) PetscCall(ISIdentity(perm, &permIdentity));
  if (!info->levels && permIdentity) { /* ICC(0) with natural ordering: fully on-device path */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(0);
  }
#endif
  /* Fallback: host symbolic factorization + GPU numeric factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1990 
/* Symbolic Cholesky: delegate the symbolic phase to the SeqAIJ (host) implementation
   after discarding any stale device factors, then install the cuSPARSE numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); /* drop leftovers from any previous factorization */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2000 
/* Report MATSOLVERCUSPARSE as the solver type of this factor; composed on factor
   matrices as the "MatFactorGetSolverType_C" method. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2006 
2007 /*MC
2008   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2009   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2011   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2012   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2013   algorithms are not recommended. This class does NOT support direct solver operations.
2014 
2015   Level: beginner
2016 
2017 .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2018 M*/
2019 
/* Create an (unfactored) MATSEQAIJCUSPARSE factor matrix B for A, for the requested
   factor type. Parses -mat_factor_bind_factorization to decide whether numeric
   factorization should run on host or device, installs the appropriate symbolic
   factorization ops (falling back to the plain SeqAIJ ones when A is bound to CPU),
   and records the preferred orderings per factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); /* also allocates (*B)->spptr, read below */

  /* Let the user choose where the factorization is performed (host or device) */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* CPU-bound matrix: use the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* CPU-bound matrix: use the plain SeqAIJ symbolic routines */
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); /* values/pattern are supplied at factorization time */
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2071 
/* Bring the matrix values back from the device to the host array a->a when the GPU
   copy is ahead (offloadmask == PETSC_OFFLOAD_GPU). Handles both unfactored matrices
   (values live in the CSR mult structure) and, with cuSPARSE >= 13.5, factored
   matrices whose values live in the triangular-factors struct. Only values are
   copied; the sparsity pattern is assumed unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* For a factored matrix, spptr holds a Mat_SeqAIJCUSPARSETriFactors instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now agree */
  }
  PetscFunctionReturn(0);
}
2100 
/* Give read/write access to the host value array, syncing values down from the
   device first so the caller sees the current data. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* ensure the host copy is up to date */
  *array = aij->a;
  PetscFunctionReturn(0);
}
2107 
/* End read/write access to the host value array: the host copy may now differ from
   the device copy, so mark the device data stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* values were possibly modified on host */
  PetscFunctionReturn(0);
}
2114 
/* Give read-only access to the host value array, syncing values down from the
   device first so the caller sees the current data. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* ensure the host copy is up to date */
  *array = aij->a;
  PetscFunctionReturn(0);
}
2121 
/* End read-only access to the host value array. Nothing was modified, so the
   offload mask is intentionally left alone. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
2127 
/* Give write-only access to the host value array. No device-to-host sync is done
   since the caller is expected to overwrite the values. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2133 
/* End write access to the host value array: the host copy has been (re)written, so
   mark the device data stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* values were written on host; device copy is stale */
  PetscFunctionReturn(0);
}
2140 
/* Expose the device CSR arrays (row offsets i, column indices j, values a) of an
   unfactored matrix, along with the memory type (CUDA) they reside in. Any requested
   output pointer may be NULL. Values are synced to the GPU first. Since cuSPARSE
   stores indices as 32-bit ints, requesting i or j errors out in 64-bit-index builds.
   Fix: corrected the grammar of the user-facing error message
   ("does not supported" -> "does not support"). */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure the device copy is current */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
2170 
/*
   MatSeqAIJCUSPARSECopyToGPU - Ensures the GPU holds a current copy of the matrix.

   No-op unless the offload mask says the host copy is the (only) valid one.
   Two paths:
     - values-only refresh: the nonzero pattern is unchanged (same nonzerostate)
       and the format is CSR, so only the values array is re-uploaded;
     - full rebuild: the pattern changed (or a non-CSR format is requested), so
       the previous GPU structures are destroyed and recreated from the host CSR
       (optionally in compressed-row form).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when host values are absent, so offloadmask stays as-is */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose is stale (structure kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* nonzero pattern changed (or non-CSR format): tear down and rebuild the GPU matrix */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the host CSR view: compressed-row (only nonempty rows) or the full one */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* structure-only matrix: count nnz from the row offsets, and do not
             mark the device copy as holding valid values at the end */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants so cuSPARSE can run in device pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: build a temporary CSR, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* only claim both copies are valid when host values actually existed */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2317 
/* Thrust functor for zipped ranges: accumulate tuple element 0 into element 1
   (i.e. dst += src). */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2324 
/* Thrust functor for zipped ranges: copy tuple element 0 into element 1
   (i.e. dst = src). */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2331 
/* Thrust functor for zipped ranges, opposite direction of VecCUDAEquals:
   copy tuple element 1 into element 0. */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2338 
/* Per-product scratch data attached to C->product->data for
   sparse-times-dense and sparse-times-sparse products on the GPU. */
struct MatMatCusparse {
  PetscBool      cisdense; /* C was MATSEQDENSE (host) on entry, so convert it back after the GPU product */
  PetscScalar   *Bt;       /* device buffer for B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat            X;        /* intermediate dense result used by PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count for the numeric phase, logged there */
  CsrMatrix     *Bcsr;     /* alternate CSR view of B when B is in compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* descriptor matching Bcsr */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* SpGEMMreuse work buffers (CUDA >= 11.4) */
  void *dBuffer5;
#endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* SpMM / SpGEMM compute buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2363 
/* Destructor callback for MatMatCusparse product data: releases every device
   buffer, cuSPARSE descriptor, and the intermediate dense matrix X, then frees
   the struct itself.  All frees tolerate NULL members. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data) {
  MatMatCusparse *p = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(p->Bt)); /* cudaFree(NULL) is a no-op */
  delete p->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (p->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(p->matSpBDescr));
  if (p->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(p->matBDescr));
  if (p->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(p->matCDescr));
  if (p->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(p->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (p->dBuffer4) PetscCallCUDA(cudaFree(p->dBuffer4));
  if (p->dBuffer5) PetscCallCUDA(cudaFree(p->dBuffer5));
#endif
  if (p->mmBuffer) PetscCallCUDA(cudaFree(p->mmBuffer));
  if (p->mmBuffer2) PetscCallCUDA(cudaFree(p->mmBuffer2));
#endif
  PetscCall(MatDestroy(&p->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2386 
2387 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2388 
/*
   Numeric phase of a sparse(A) x dense(B) product on the GPU.

   Handles AB, AtB, ABt, PtAP, RARt.  The sparse factor is multiplied with the
   dense factor via cusparseSpMM (CUDA >= 11) or csrmm (older CUDA, where B must
   be explicitly transposed via cublasXgeam for the *Bt variants).  For PtAP and
   RARt the sparse product is computed into the intermediate dense matrix
   mmdata->X, and a final dense-dense multiply produces C.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select which sparse struct (A or its explicit transpose) and which cusparse
     operation to use, and the dimensions m x n of the sparse-product result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose of A instead of transposed SpMM */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write the sparse product into the scratch dense matrix X;
     other products write directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    /* lazily create the generic SpMat descriptor for the sparse factor */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* only grow the buffer; reuse it when it is already large enough */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  /* for RARt/PtAP finish with the dense product against B (R or P) */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* restore the original (host) types of C and B when they were converted above/in symbolic */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2544 
/*
   Symbolic phase of a sparse(A) x dense(B) product: sets the sizes and type of
   C, allocates the MatMatCusparse product data (including the B^T scratch
   buffer on pre-CUDA-11 and the intermediate dense X for PtAP/RARt), and wires
   up the numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2613 
/*
   Numeric phase of a sparse(A) x sparse(B) product (SpGEMM) on the GPU.

   The sparsity pattern and all cuSPARSE descriptors/buffers were set up in the
   symbolic phase; here only the values are (re)computed.  Shortcuts: when
   mmdata->reusesym is set the values were already produced during the symbolic
   phase, and when C has no nonzeros there is nothing to compute — both jump to
   the common assembly bookkeeping at 'finalize'.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* when a factor is symmetric the product can use it untransposed, provided
     the symbolic phase made the same choice */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored transpose structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: structure-reusing SpGEMM recomputes values in one call */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  /* older CUDA-11: recompute then copy the values into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2732 
/* Symbolic phase of the sparse matrix-matrix product C = op(A)*op(B) for two MATSEQAIJCUSPARSE
   matrices: determines the sparsity pattern of C (row offsets and column indices, kept both on
   the GPU and mirrored on the host in the Mat_SeqAIJ structure) and stores the product work data
   (MatMatCusparse) consumed later by MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE.
   Supported product types: AB, AtB (through the explicit transpose of A), and ABt (through the
   explicit transpose of B); cuSPARSE spgemm itself is always called with non-transpose ops. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: AtB with symmetric A (resp. ABt with symmetric B) is computed as plain AB */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select the result sizes (C is m x n with inner dimension k) and the GPU mult structs of the
     two operands; AtB/ABt go through the explicit transpose of the corresponding operand */
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
    void  *dBuffer1    = NULL;
    void  *dBuffer2    = NULL;
    void  *dBuffer3    = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /*----------------------------------------------------------------------*/
    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /*----------------------------------------------------------------------*/
    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz                = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    /*----------------------------------------------------------------------*/
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the GPU pattern on the host: fill the Mat_SeqAIJ i/j arrays, row lengths and
     diagonal marks so C behaves as a regular (preallocated, unassembled) SeqAIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt) !!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3104 
3105 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3106 
3107 /* handles sparse or dense B */
3108 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
3109   Mat_Product *product = mat->product;
3110   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3111 
3112   PetscFunctionBegin;
3113   MatCheckProduct(mat, 1);
3114   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3115   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3116   if (product->type == MATPRODUCT_ABC) {
3117     Ciscusp = PETSC_FALSE;
3118     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3119   }
3120   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3121     PetscBool usecpu = PETSC_FALSE;
3122     switch (product->type) {
3123     case MATPRODUCT_AB:
3124       if (product->api_user) {
3125         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3126         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3127         PetscOptionsEnd();
3128       } else {
3129         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3130         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3131         PetscOptionsEnd();
3132       }
3133       break;
3134     case MATPRODUCT_AtB:
3135       if (product->api_user) {
3136         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3137         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3138         PetscOptionsEnd();
3139       } else {
3140         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3141         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3142         PetscOptionsEnd();
3143       }
3144       break;
3145     case MATPRODUCT_PtAP:
3146       if (product->api_user) {
3147         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3148         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3149         PetscOptionsEnd();
3150       } else {
3151         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3152         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3153         PetscOptionsEnd();
3154       }
3155       break;
3156     case MATPRODUCT_RARt:
3157       if (product->api_user) {
3158         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3159         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3160         PetscOptionsEnd();
3161       } else {
3162         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3163         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3164         PetscOptionsEnd();
3165       }
3166       break;
3167     case MATPRODUCT_ABC:
3168       if (product->api_user) {
3169         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3170         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3171         PetscOptionsEnd();
3172       } else {
3173         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3174         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3175         PetscOptionsEnd();
3176       }
3177       break;
3178     default: break;
3179     }
3180     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3181   }
3182   /* dispatch */
3183   if (isdense) {
3184     switch (product->type) {
3185     case MATPRODUCT_AB:
3186     case MATPRODUCT_AtB:
3187     case MATPRODUCT_ABt:
3188     case MATPRODUCT_PtAP:
3189     case MATPRODUCT_RARt:
3190       if (product->A->boundtocpu) {
3191         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3192       } else {
3193         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3194       }
3195       break;
3196     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
3197     default: break;
3198     }
3199   } else if (Biscusp && Ciscusp) {
3200     switch (product->type) {
3201     case MATPRODUCT_AB:
3202     case MATPRODUCT_AtB:
3203     case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
3204     case MATPRODUCT_PtAP:
3205     case MATPRODUCT_RARt:
3206     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
3207     default: break;
3208     }
3209   } else { /* fallback for AIJ */
3210     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3211   }
3212   PetscFunctionReturn(0);
3213 }
3214 
/* y = A*x: forwards to the shared kernel driver with no added vector (yy = NULL),
   no transpose and no Hermitian flag */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat mat, Vec x, Vec y) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(mat, x, NULL, y, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3220 
/* z = A*x + y: forwards to the shared kernel driver with no transpose and no Hermitian flag */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat mat, Vec x, Vec y, Vec z) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(mat, x, y, z, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3226 
/* y = A^H*x: forwards to the shared kernel driver with both the transpose and
   Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat mat, Vec x, Vec y) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(mat, x, NULL, y, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3232 
/* z = A^H*x + y: forwards to the shared kernel driver with both the transpose and
   Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat mat, Vec x, Vec y, Vec z) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(mat, x, y, z, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3238 
/* y = A^T*x: forwards to the shared kernel driver with the transpose flag set and
   the Hermitian flag cleared */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat mat, Vec x, Vec y) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(mat, x, NULL, y, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3244 
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Fixes over the previous version: (1) a grid-stride loop makes the kernel correct for any
   launch configuration instead of silently skipping the tail when the grid is too small;
   (2) indices are computed in PetscInt after casting blockIdx.x, avoiding 32-bit overflow of
   blockIdx.x * blockDim.x for very large n; (3) idx and x are const-qualified (read-only).
   NOTE(review): the scatter is not atomic, so entries of idx processed in one launch are
   assumed to be distinct -- confirm with callers before reusing with duplicate indices. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  const PetscInt stride = (PetscInt)gridDim.x * blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3249 
3250 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3251 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
3252   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3253   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3254   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3255   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3256   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3257   PetscBool                     compressed;
3258 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3259   PetscInt nx, ny;
3260 #endif
3261 
3262   PetscFunctionBegin;
3263   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3264   if (!a->nz) {
3265     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
3266     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3267     PetscFunctionReturn(0);
3268   }
3269   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3270   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3271   if (!trans) {
3272     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3273     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3274   } else {
3275     if (herm || !A->form_explicit_transpose) {
3276       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3277       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3278     } else {
3279       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3280       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3281     }
3282   }
3283   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3284   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3285 
3286   try {
3287     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3288     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3289     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3290 
3291     PetscCall(PetscLogGpuTimeBegin());
3292     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3293       /* z = A x + beta y.
3294          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3295          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3296       */
3297       xptr = xarray;
3298       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3299       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3300 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3301       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3302           allocated to accommodate different uses. So we get the length info directly from mat.
3303        */
3304       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3305         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3306         nx             = mat->num_cols;
3307         ny             = mat->num_rows;
3308       }
3309 #endif
3310     } else {
3311       /* z = A^T x + beta y
3312          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3313          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3314        */
3315       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3316       dptr = zarray;
3317       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3318       if (compressed) { /* Scatter x to work vector */
3319         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3320 
3321         thrust::for_each(
3322 #if PetscDefined(HAVE_THRUST_ASYNC)
3323           thrust::cuda::par.on(PetscDefaultCudaStream),
3324 #endif
3325           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3326           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3327       }
3328 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3329       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3330         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3331         nx             = mat->num_rows;
3332         ny             = mat->num_cols;
3333       }
3334 #endif
3335     }
3336 
3337     /* csr_spmv does y = alpha op(A) x + beta y */
3338     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3339 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3340       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3341       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3342         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3343         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3344         PetscCallCUSPARSE(
3345           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3346         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3347 
3348         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3349       } else {
3350         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3351         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3352         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3353       }
3354 
3355       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3356                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3357 #else
3358       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3359       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3360 #endif
3361     } else {
3362       if (cusparsestruct->nrows) {
3363 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3364         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3365 #else
3366         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3367         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3368 #endif
3369       }
3370     }
3371     PetscCall(PetscLogGpuTimeEnd());
3372 
3373     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3374       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3375         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3376           PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
3377         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3378           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
3379         }
3380       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3381         PetscCall(VecSet_SeqCUDA(zz, 0));
3382       }
3383 
3384       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3385       if (compressed) {
3386         PetscCall(PetscLogGpuTimeBegin());
3387         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3388            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3389            prevent that. So I just add a ScatterAdd kernel.
3390          */
3391 #if 0
3392         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3393         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3394                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3395                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3396                          VecCUDAPlusEquals());
3397 #else
3398         PetscInt n = matstruct->cprowIndices->size();
3399         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3400 #endif
3401         PetscCall(PetscLogGpuTimeEnd());
3402       }
3403     } else {
3404       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3405     }
3406     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3407     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3408     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3409   } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
3410   if (yy) {
3411     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3412   } else {
3413     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3414   }
3415   PetscFunctionReturn(0);
3416 }
3417 
/* Computes zz = A^T * xx + yy by delegating to the shared mult-add kernel
   with trans = PETSC_TRUE and herm = PETSC_FALSE (plain transpose, not conjugate). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3423 
/* Finish assembly on the CPU side, then drop the cached device matrix if the
   assembly changed the nonzero structure (it would no longer match the host CSR). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscObjectState    state_before;

  PetscFunctionBegin;
  /* record the nonzero state so we can detect a structural change made by the CPU assembly */
  state_before = A->nonzerostate;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (cusparsestruct->deviceMat && A->nonzerostate != state_before) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3437 
3438 /* --------------------------------------------------------------------------------*/
3439 /*@
3440    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3442    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3443    assembly performance the user should preallocate the matrix storage by setting
3444    the parameter nz (or the array nnz).  By setting these parameters accurately,
3445    performance during matrix assembly can be increased by more than a factor of 50.
3446 
3447    Collective
3448 
3449    Input Parameters:
3450 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3451 .  m - number of rows
3452 .  n - number of columns
3453 .  nz - number of nonzeros per row (same for all rows)
3454 -  nnz - array containing the number of nonzeros in the various rows
3455          (possibly different for each row) or NULL
3456 
3457    Output Parameter:
3458 .  A - the matrix
3459 
3460    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3462    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3463 
3464    Notes:
3465    If nnz is given then nz is ignored
3466 
3467    The AIJ format, also called
3468    compressed row storage, is fully compatible with standard Fortran 77
3469    storage.  That is, the stored row and column indices can begin at
3470    either one (as in Fortran) or zero.  See the users' manual for details.
3471 
3472    Specify the preallocated storage with either nz or nnz (not both).
3473    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3474    allocation.  For large problems you MUST preallocate memory or you
3475    will get TERRIBLE performance, see the users' manual chapter on matrices.
3476 
3477    By default, this format uses inodes (identical nodes) when possible, to
3478    improve numerical efficiency of matrix-vector products and solves. We
3479    search for consecutive rows with the same nonzero structure, thereby
3480    reusing matrix information to achieve increased efficiency.
3481 
3482    Level: intermediate
3483 
3484 .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3485 @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* sequential matrix: local sizes and global sizes coincide */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* const is cast away to match the preallocation routine's signature; nnz is not modified */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}
3494 
/* Destroy a SeqAIJCUSPARSE matrix: release the GPU-side context, detach every
   function-pointer slot this type composed on the object, then destroy the CPU base. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  /* every slot composed on the object by this matrix type (see MatConvert/MatBindToCPU) */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* plain matrices keep a Mat_SeqAIJCUSPARSE in spptr, factored ones a triangular-factors context */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed) / sizeof(composed[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3515 
3516 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3517 static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by first cloning on the CPU with the SeqAIJ routine, then converting
   the clone in place back to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
3524 
/* Y = Y + a*X on the GPU. Chooses among three paths depending on how the nonzero
   patterns relate: a single cuBLAS axpy over the value arrays when the patterns are
   identical, cuSPARSE spgeam when X's pattern is a subset of Y's, and the CPU
   SeqAIJ fallback (with Y's cached transpose invalidated) otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one of the two matrices is not using the GPU axpy (e.g. bound to CPU): fall back to the CPU kernel */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and device-side
     row offsets and column indices compare equal element-by-element */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* compute Y = a*X + 1.0*Y in place via spgeam; the alpha/beta scalars are read
       from the host, so switch the handle's pointer mode for the duration */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit caller-allocated work buffer for spgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up one-to-one, so a dense BLAS1 axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* patterns differ: the result's structure changes, so use the CPU kernel and
       drop Y's cached transpose */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
3609 
/* Scales every stored nonzero of Y by a, in place on the GPU, with one cuBLAS scal call. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *values;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &values));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  /* values is a device pointer; cublasXscal runs on the GPU */
  PetscCallCUBLAS(cublasXscal(handle, n, &a, values, stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &values));
  /* values changed, so any cached diagonal information is stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3628 
/* Zeroes all stored entries of A on both the device (matrix and cached transpose,
   when present) and the host, then updates the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  Mat_SeqAIJ *a          = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroed_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
      if (csr->values) {
        zeroed_gpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csr = (CsrMatrix *)spptr->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
    }
  }
  /* zero the host copy too, and drop the cached diagonal */
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* host and device agree only if the device values were actually zeroed above */
  A->offloadmask = zeroed_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3654 
/* Switch A's operation table between the CPU (SeqAIJ) and GPU (CUSPARSE) implementations.
   flg == PETSC_TRUE binds the matrix to the CPU; PETSC_FALSE (re)installs the GPU paths. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; no function table to swap here */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* bring the CPU copy up to date before the CPU kernels take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* reset the SeqAIJ-level accessor table so default (CPU-array) accessors are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    /* detach the GPU-specific composed functions */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    /* install the GPU implementations and device-aware array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are used only when bound to the CPU and the inode structure exists */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3717 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE. On first conversion this allocates the
   GPU-side context (cusparse handle on the default PETSc stream, storage-format and
   algorithm defaults), then installs the GPU operation table via
   MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE) and the type-specific composed functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX the caller passes &A, so B aliases A */

  /* GPU matrices pair naturally with CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry a triangular-factors context instead of the plain one */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table and register the type-specific query/set functions */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3776 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
3783 
3784 /*MC
3785    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3786 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3788    CSR, ELL, or Hybrid format.
3789    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3790 
3791    Options Database Keys:
3792 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3793 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3796 
3797   Level: beginner
3798 
3799 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3800 M*/
3801 
3802 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3803 
/* Register the cuSPARSE solver packages: the banded LU solver for SeqAIJ, and the
   generic cuSPARSE factorizations (LU/Cholesky/ILU/ICC) for SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  for (size_t i = 0; i < sizeof(factors) / sizeof(factors[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(0);
}
3814 
/* Release all COO-assembly scratch data held by the CUSPARSE context of mat.
   Safe to call when no COO preallocation exists (cusp may be NULL, members may be NULL). */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* reset the pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d when they are
       non-NULL, so leaving the freed addresses here would cause a double free later */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3831 
/* Free a Mat_SeqAIJCUSPARSE context: the device matrix, its cached transpose,
   all auxiliary device vectors/buffers, and the cusparse handle. Sets *cusparsestruct to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) {
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->csr2csc_i;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
    if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));
    /* PetscFree also zeroes the caller's pointer */
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3849 
/* Free the three thrust device vectors backing a CsrMatrix, then the struct itself.
   Sets *mat to NULL; a NULL input is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) {
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;

    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3861 
/* Free a triangular-factor struct: its cuSPARSE descriptors, CSR data, device solve
   buffer, and pinned host copy. Sets *trifactor to NULL; a NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) {
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
    if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
    PetscCall(CsrMatrix_Destroy(&tf->csrMat));
    if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
    if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
#endif
    /* PetscFree also zeroes the caller's pointer */
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3877 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct built for mat-vec products: the stored matrix
   (CSR, or ELL/HYB on CUDA < 11), its cuSPARSE descriptors, the compressed-row index
   list, and the device-resident scalar constants alpha_one/beta_zero/beta_one. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) {
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        /* Fix: the PetscErrorCode returned by CsrMatrix_Destroy() was previously dropped;
           wrap it in PetscCall() like every other error-returning call in this file */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalars used as SpMV alpha/beta arguments */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) { /* the three cached SpMV configurations in cuSpMV[] */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3918 
/* Reset (free) all data attached to a Mat_SeqAIJCUSPARSETriFactors without freeing the
   container or its cuSPARSE handle.  Every freed pointer is set to NULL afterwards so a
   subsequent Reset (or a re-factorization after Reset) does not free it a second time;
   the original code left the pointers dangling, risking double frees on repeated calls. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) {
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* lower/upper factors and their transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->a_band_d      = NULL; /* prevent double free on a later Reset */
    fs->i_band_d      = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* cudaFree(NULL) is a no-op, so unconditional frees are safe here */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->csrRowPtr     = NULL;
    fs->csrColIdx     = NULL;
    fs->csrVal        = NULL;
    fs->X             = NULL;
    fs->Y             = NULL;
    fs->spsvBuffer_L  = NULL;
    fs->spsvBuffer_U  = NULL;
    fs->spsvBuffer_Lt = NULL;
    fs->spsvBuffer_Ut = NULL;
    /* guard the descriptor destroys so a descriptor that was never created (or was
       already destroyed by a previous Reset) is not passed to cuSPARSE again */
    if (fs->matDescr_M) PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    if (fs->spsvDescr_L) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    if (fs->spsvDescr_Lt) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    if (fs->spsvDescr_U) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    if (fs->spsvDescr_Ut) PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    if (fs->ilu0Info_M) PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    if (fs->ic0Info_M) PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->matDescr_M   = NULL;
    fs->spMatDescr_L = NULL;
    fs->spMatDescr_U = NULL;
    fs->spsvDescr_L  = NULL;
    fs->spsvDescr_Lt = NULL;
    fs->spsvDescr_U  = NULL;
    fs->spsvDescr_Ut = NULL;
    fs->dnVecDescr_X = NULL;
    fs->dnVecDescr_Y = NULL;
    fs->ilu0Info_M   = NULL;
    fs->ic0Info_M    = NULL;

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
3966 
/* Free a Mat_SeqAIJCUSPARSETriFactors container: reset its contents, destroy the
   cuSPARSE handle it owns, and free the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if ((*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3978 
/* Lexicographic strict-weak ordering on (i,j) pairs: by row first, then by column. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3986 
/* Equality of (i,j) pairs: both components must match. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3993 
/* Binary "changed?" indicator used with adjacent_difference: 1 when the two values differ, 0 otherwise. */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
3997 
/* Logical OR of two 0/1 indicators, returned as a PetscInt (0 or 1). */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
4001 
4002 #include <thrust/iterator/discard_iterator.h>
4003 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Set values of A from COO-ordered v[] for a matrix preallocated with
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().  v[] may live in host or device memory;
   imode selects INSERT_VALUES (overwrite) or ADD_VALUES (accumulate).  All arithmetic
   runs on the GPU through thrust, driven by the permutation cusp->cooPerm (COO order ->
   sorted CSR order) and, when the COO input had duplicates, the reduction map
   cusp->cooPerm_a built at preallocation time. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* temporary device copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO metadata: nothing to scatter, just finish assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* NULL v with INSERT_VALUES zeroes the matrix; with ADD_VALUES it leaves it unchanged */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n); /* host -> device copy of the COO values */
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now the authoritative one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
4079 
/* Mark the cached transpose of A as stale.  With destroy == PETSC_TRUE the cached
   transpose matrix and the csr2csc permutation used to rebuild it are freed as well. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4094 
4095 #include <thrust/binary_search.h>
4096 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Preallocate A from n COO triplets (coo_i[], coo_j[]; host or device memory).  The
   'Basic' path assumes all indices are nonnegative.  It sorts the (i,j) pairs on the
   GPU, removes duplicates, builds the host CSR index structure of A, and stores in
   cusp->cooPerm the permutation from COO order to sorted order (plus, when duplicates
   exist, the reduction map cusp->cooPerm_a) for later use by
   MatSetValuesCOO_SeqAIJCUSPARSE_Basic(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* existing work arrays are reusable only for the same nonzero count */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j; /* temporary device buffers when input is on host */
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) { /* stage host row indices in a device buffer */
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row end offsets, copied into a->i+1 below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n); /* device copy of the sorted column indices */

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of A with ones sized from the deduplicated COO data */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost)); /* first a->nz entries of d_j are the unique columns */
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4230 
/* COO preallocation dispatcher: use the fast 'Basic' path unless host-resident
   coo_i[]/coo_j[] contain negative entries, which require the extended COO machinery
   of the generic SeqAIJ implementation (its jmap/perm arrays are then mirrored on device). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  PetscBool    coo_basic = PETSC_TRUE;
  PetscMemType mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) { /* device-resident indices are not scanned; they take the basic path */
      for (PetscCount k = 0; k < coo_n && coo_basic; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) coo_basic = PETSC_FALSE;
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    /* build jmap/perm on the host with the generic AIJ implementation, then copy to device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4268 
/* Kernel: scatter COO values kv[] into the CSR value array a[] of length nnz.
   For CSR entry i, perm[jmap[i]..jmap[i+1]) lists the positions in kv[] contributing
   to it.  Launched 1-D with a grid-stride loop, so any grid size is correct. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    if (imode == INSERT_VALUES) a[i] = sum; /* overwrite */
    else a[i] += sum;                       /* accumulate */
  }
}
4278 
/* Insert or add COO values v[] into A.  On the extended COO path the work is done by
   the MatAddCOOValues kernel using the device-side jmap/perm maps; otherwise the call
   is delegated to the 'Basic' thrust implementation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
    PetscFunctionReturn(0);
  }

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] on the host, stage it in a temporary device buffer */
    PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites everything, so write-only access avoids a host->device copy */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  if (Annz) {
    MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(0);
}
4312 
4313 /*@C
4314     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
4315 
4316    Not collective
4317 
4318     Input Parameters:
4319 +   A - the matrix
4320 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4321 
4322     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4325 
4326     Level: developer
4327 
4328     Note:
4329       When compressed is true, the CSR structure does not contain empty rows
4330 
4331 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4332 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* both output pointers are required */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR data is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (compressed || !a->compressedrow.use) {
    *i = csr->row_offsets->data().get();
  } else {
    /* caller wants full (uncompressed) row offsets; build and cache them on the device on demand */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4359 
4360 /*@C
4361     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4362 
4363    Not collective
4364 
4365     Input Parameters:
4366 +   A - the matrix
4367 -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4368 
4369     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4372 
4373     Level: developer
4374 
4375 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4376 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the pointers were borrowed from device storage; just invalidate the caller's copies
     (compressed is accepted for symmetry with MatSeqAIJCUSPARSEGetIJ() but unused here) */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4385 
4386 /*@C
4387    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4388 
4389    Not Collective
4390 
4391    Input Parameter:
4392 .   A - a `MATSEQAIJCUSPARSE` matrix
4393 
4394    Output Parameter:
4395 .   a - pointer to the device data
4396 
4397    Level: developer
4398 
4399    Note:
4400    May trigger host-device copies if up-to-date matrix data is on host
4401 
4402 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4403 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may copy host values to the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offload mask and cached transpose stay valid */
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4420 
4421 /*@C
4422    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4423 
4424    Not Collective
4425 
4426    Input Parameter:
4427 .   A - a `MATSEQAIJCUSPARSE` matrix
4428 
4429    Output Parameter:
4430 .   a - pointer to the device data
4431 
4432    Level: developer
4433 
4434 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4435 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no matrix state changes, just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4444 
4445 /*@C
4446    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4447 
4448    Not Collective
4449 
4450    Input Parameter:
4451 .   A - a `MATSEQAIJCUSPARSE` matrix
4452 
4453    Output Parameter:
4454 .   a - pointer to the device data
4455 
4456    Level: developer
4457 
4458    Note:
4459    May trigger host-device copies if up-to-date matrix data is on host
4460 
4461 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4462 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access needs current device values */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify the values: the device copy becomes authoritative and
     any cached transpose becomes stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4481 /*@C
4482    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4483 
4484    Not Collective
4485 
4486    Input Parameter:
4487 .   A - a `MATSEQAIJCUSPARSE` matrix
4488 
4489    Output Parameter:
4490 .   a - pointer to the device data
4491 
4492    Level: developer
4493 
4494 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4495 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have changed: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4506 
4507 /*@C
4508    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4509 
4510    Not Collective
4511 
4512    Input Parameter:
4513 .   A - a `MATSEQAIJCUSPARSE` matrix
4514 
4515    Output Parameter:
4516 .   a - pointer to the device data
4517 
4518    Level: developer
4519 
4520    Note:
4521    Does not trigger host-device copies and flags data validity on the GPU
4522 
4523 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4524 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no MatSeqAIJCUSPARSECopyToGPU(), current values are irrelevant */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* device copy becomes authoritative; cached transpose becomes stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4542 
4543 /*@C
4544    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4545 
4546    Not Collective
4547 
4548    Input Parameter:
4549 .   A - a `MATSEQAIJCUSPARSE` matrix
4550 
4551    Output Parameter:
4552 .   a - pointer to the device data
4553 
4554    Level: developer
4555 
4556 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4557 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4568 
/* strict-weak ordering of (row, col, value, perm) tuples by (row, col) only;
   used to merge two COO listings into a single row-major sorted listing */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    /* lexicographic: rows decide, columns break ties */
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
4576 
/* unary functor adding a fixed offset to an index; used to translate the
   column indices of B when its columns are appended after those of A */
struct Shift {
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4583 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation

   Input Parameters:
+  A     - a MATSEQAIJCUSPARSE matrix
.  B     - a MATSEQAIJCUSPARSE matrix with the same number of rows as A
-  reuse - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX (MAT_INPLACE_MATRIX is not supported)

   Output Parameter:
.  C     - the merged matrix [A, B], assembled entirely on the GPU

   With MAT_INITIAL_MATRIX the pattern of C is built by merging the COO forms of A and B;
   the permutation interleaving their entries is cached (Ccusp->cooPerm) so that
   MAT_REUSE_MATRIX only needs to scatter the new values of A and B into C. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C of size m x (nA + nB) and set up its cuSPARSE CSR container */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the SpMV routines */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* expand A and B to COO, merge row-wise on the device, then convert back to CSR */
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      /* merge keeps (row,col) order; wPerm records which entries came from A (1) or B (0) */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split merged positions into those of A (first Annz slots) and B (rest) */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C^T is [A^T; B^T] stacked: A^T rows first, then B^T rows with offsets shifted by nnz(A) */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR pattern to the host so the Mat_SeqAIJ side of C is usable */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: pattern unchanged, only scatter the new values via the cached permutation */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through cooPerm[0:Annz), B's through cooPerm[Annz:) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4864 
/* copies selected entries of the device value array of A into v: v[k] = a[idx[k]] for
   k = 0..n-1 (or the leading n values when idx is NULL). v may point to host or device
   memory; a device-side gather is used, with a staging buffer when v is on the host. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* destination may live on either host or device */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set, then gather av[idx[...]] entirely on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer, copied back to host v below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: copy the leading n values verbatim */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data moved device->host, so log GpuToCpu (was mislogged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}
4899